org.apache.spark.sql.types.BinaryType Scala Examples
The following examples show how to use org.apache.spark.sql.types.BinaryType.
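Before the project-specific examples, here is a minimal, self-contained sketch (written for this page rather than taken from any of the projects below) showing the two uses of BinaryType that recur throughout: declaring a binary field in a StructType schema and casting an existing column to binary. The object name, column names, and local-mode SparkSession are illustrative assumptions.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType}

object BinaryTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("BinaryTypeSketch").getOrCreate()
    import spark.implicits._

    // 1. Declare a schema with a BinaryType column (e.g. a Kafka-style key/value layout).
    val schema = StructType(Seq(
      StructField("key", StringType, nullable = true),
      StructField("value", BinaryType, nullable = true)
    ))
    val empty = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
    empty.printSchema() // "value" is reported as binary

    // 2. Cast a string column to BinaryType; Spark represents the values as Array[Byte].
    val df = Seq("hello", "world").toDF("text")
      .withColumn("text_bytes", col("text").cast(BinaryType))
    df.printSchema()

    spark.stop()
  }
}

The examples that follow apply one or both of these patterns in concrete project contexts.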
Example 1
Source File: AvroDataToCatalyst.scala From spark-schema-registry with Apache License 2.0

package com.hortonworks.spark.registry.avro

import java.io.ByteArrayInputStream

import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey}
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._

case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int],
    config: Map[String, Object])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes = Seq(BinaryType)

  @transient private lazy val srDeser: AvroSnapshotDeserializer = {
    val obj = new AvroSnapshotDeserializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version)

  @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText)

  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType

  @transient private lazy val avroDeser = new AvroDeserializer(avroSchema, dataType)

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion))
    val result = row match {
      case r: InternalRow => r.copy()
      case _ => row
    }
    result
  }

  override def simpleString: String = {
    s"from_sr(${child.sql}, ${dataType.simpleString})"
  }

  override def sql: String = {
    s"from_sr(${child.sql}, ${dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)")
  }

  private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = {
    val srClient = new SchemaRegistryClient(config.asJava)
    version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v)))
      .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName))
  }
}
Example 2
Source File: MiscFunctionsSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.codec.digest.DigestUtils

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, StringType, BinaryType}

class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("md5") {
    checkEvaluation(Md5(Literal("ABC".getBytes)), "902fbdd2b1df0c4f70b4a5d23525e932")
    checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "6ac1e56bc78f031059be7be854522c4c")
    checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Md5, BinaryType)
  }

  test("sha1") {
    checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
    checkEvaluation(Sha1(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "5d211bad8f4ee70e16c7d343a838fc344a1ed961")
    checkEvaluation(Sha1(Literal.create(null, BinaryType)), null)
    checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
    checkConsistencyBetweenInterpretedAndCodegen(Sha1, BinaryType)
  }

  test("sha2") {
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal(256)), DigestUtils.sha256Hex("ABC"))
    checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)),
      DigestUtils.sha384Hex(Array[Byte](1, 2, 3, 4, 5, 6)))
    // unsupported bit length
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(1024)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(512)), null)
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal.create(null, IntegerType)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal.create(null, IntegerType)), null)
  }

  test("crc32") {
    checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264L)
    checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      2180413220L)
    checkEvaluation(Crc32(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Crc32, BinaryType)
  }
}
Example 3
Source File: Test.scala From shc with Apache License 2.0

package org.apache.spark.sql

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
import org.apache.spark.sql.types.BinaryType

object Test {
  def main(args: Array[String]) {
    val a: Array[Byte] = Array.fill(10)(Byte.MinValue)
    val b = Bytes.toBytes("row003")
    System.arraycopy(b, 0, a, 0, b.length)
    val c = Bytes.toBytes(Int.MinValue)
    System.arraycopy(c, 0, a, b.length, c.length)
    val len = a.indexOf(HBaseTableCatalog.delimiter, 0)
    val s1 = Bytes.toString(a, 0, 6)
    val s2 = Bytes.toString(a, 0, len)

    val l = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(l, 0, Double.MinValue)
    val m = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(m, 0, -20.0)
    val n = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(n, 0, 0.0)
    val o = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(o, 0, 20.0)
    val p = Array.fill(8)(Byte.MaxValue)
    Bytes.putDouble(p, 0, Double.MaxValue)

    val c1 = BinaryType.ordering.compare(l, m)
    val c2 = BinaryType.ordering.compare(m, n)
    val c3 = BinaryType.ordering.compare(n, o)
    val c4 = BinaryType.ordering.compare(o, p)

    val p1 = Array.fill(10)(0: Byte)
    Bytes.putBytes(p1, 0, Bytes.toBytes("row010"), 0, 6)
    val p2 = Array.fill(10)(-1: Byte)
    Bytes.putBytes(p2, 0, Bytes.toBytes("row010"), 0, 6)
    val p3 = Array.fill(10)(Byte.MaxValue)
    Bytes.putBytes(p3, 0, Bytes.toBytes("row010"), 0, 6)
    Bytes.putInt(p3, 6, 10)
    val p4 = Bytes.compareTo(p1, p3)
    val p5 = Bytes.compareTo(p2, p3)

    val z = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z, 0, -1)
    val z1 = Array.fill(4)(Byte.MinValue)
    Bytes.putInt(z1, 0, -2147483648)
    val z2 = Bytes.compareTo(z, z1)

    val t = Array.fill(4)(-1: Byte)
    println(Bytes.toInt(t))

    val s = Bytes.toBytes(1.4.asInstanceOf[Float])
    println(Bytes.toInt(s))
    println(Bytes.toFloat(s))
    val w = Bytes.toBytes(-1.4.asInstanceOf[Float])
    println(Bytes.toInt(w))
    println(Bytes.toFloat(w))

    val buffer1 = Bytes.toBytes(-1.0f)
    val b1 = Bytes.toInt(buffer1)
    var buffer = Array.fill(4)(-1: Byte)
    var buffer2 = Bytes.toBytes(-1.0f)
    var buffer3 = java.lang.Float.floatToIntBits(-1.0f)
    val b3 = Bytes.toBytes(buffer3)
    val out = Bytes.toInt(buffer1) ^ Integer.MIN_VALUE
    buffer2 = Bytes.toBytes(out)
    var i: Int = java.lang.Float.floatToIntBits(-1.0f)
    i = (i ^ ((i >> Integer.SIZE - 1) | Integer.MIN_VALUE)) + 1
    Bytes.putInt(buffer, 0, i)

    val mn = Bytes.toBytes(-0.0f)
    println(Bytes.toFloat(mn))
    println(Float.MinPositiveValue)
    println(s"a")
  }
}
Example 4
Source File: BinaryDeserializer.scala From gimel with Apache License 2.0

package com.paypal.gimel.deserializers.generic

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.BinaryType

import com.paypal.gimel.deserializers.generic.conf.{GenericDeserializerConfigs, GenericDeserializerConfiguration, GenericDeserializerConstants}
import com.paypal.gimel.serde.common.Deserializer

class BinaryDeserializer extends Deserializer {
  override def deserialize(dataframe: DataFrame, props: Map[String, Any] = Map.empty): DataFrame = {
    val conf = new GenericDeserializerConfiguration(props)
    if (!dataframe.columns.contains(conf.columnToDeserialize)) {
      throw new IllegalArgumentException(
        s"""
           | Column to Deserialize does not exist in dataframe --> ${conf.columnToDeserialize}
           | Please set the property ${GenericDeserializerConfigs.columnToDeserializeKey}
           | Note: Default value is "${GenericDeserializerConstants.columnToDeserialize}"
         """.stripMargin
      )
    } else {
      val kafkaValueMessageColAlias = "valueBinary"
      val deserializedDF = dataframe.withColumn(kafkaValueMessageColAlias,
        dataframe(conf.columnToDeserialize).cast(BinaryType))
      deserializedDF.drop(conf.columnToDeserialize)
        .withColumnRenamed(kafkaValueMessageColAlias, conf.columnToDeserialize)
    }
  }
}
Example 5
Source File: CatalystDataToAvro.scala From spark-schema-registry with Apache License 2.0

package com.hortonworks.spark.registry.avro

import com.hortonworks.registries.schemaregistry.{SchemaCompatibility, SchemaMetadata}
import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotSerializer
import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._

case class CatalystDataToAvro(
    child: Expression,
    schemaName: String,
    recordName: String,
    nameSpace: String,
    config: Map[String, Object]
  ) extends UnaryExpression {

  override def dataType: DataType = BinaryType

  private val topLevelRecordName = if (recordName == "") schemaName else recordName

  @transient private lazy val avroType =
    SchemaConverters.toAvroType(child.dataType, child.nullable, topLevelRecordName, nameSpace)

  @transient private lazy val avroSer =
    new AvroSerializer(child.dataType, avroType, child.nullable)

  @transient private lazy val srSer: AvroSnapshotSerializer = {
    val obj = new AvroSnapshotSerializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srClient = new SchemaRegistryClient(config.asJava)

  @transient private lazy val schemaMetadata = {
    var schemaMetadataInfo = srClient.getSchemaMetadataInfo(schemaName)
    if (schemaMetadataInfo == null) {
      val generatedSchemaMetadata = new SchemaMetadata.Builder(schemaName).
        `type`(AvroSchemaProvider.TYPE)
        .schemaGroup("Autogenerated group")
        .description("Autogenerated schema")
        .compatibility(SchemaCompatibility.BACKWARD).build
      srClient.addSchemaMetadata(generatedSchemaMetadata)
      generatedSchemaMetadata
    } else {
      schemaMetadataInfo.getSchemaMetadata
    }
  }

  override def nullSafeEval(input: Any): Any = {
    val avroData = avroSer.serialize(input)
    srSer.serialize(avroData.asInstanceOf[Object], schemaMetadata)
  }

  override def simpleString: String = {
    s"to_sr(${child.sql}, ${child.dataType.simpleString})"
  }

  override def sql: String = {
    s"to_sr(${child.sql}, ${child.dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input =>
      s"(byte[]) $expr.nullSafeEval($input)")
  }
}
Example 6
Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0

package org.apache.spark.sql.kinesis

import java.nio.ByteBuffer

import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection}
import org.apache.spark.sql.types.{BinaryType, StringType}

private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String],
                                        inputSchema: Seq[Attribute]) extends Logging {

  private var producer: KinesisProducer = _
  private val projection = createProjection
  private val streamName = producerConfiguration.getOrElse(
    KinesisSourceProvider.SINK_STREAM_NAME_KEY, "")

  def execute(iterator: Iterator[InternalRow]): Unit = {
    producer = CachedKinesisProducer.getOrCreate(producerConfiguration)
    while (iterator.hasNext) {
      val currentRow = iterator.next()
      val projectedRow = projection(currentRow)
      val partitionKey = projectedRow.getString(0)
      val data = projectedRow.getBinary(1)

      sendData(partitionKey, data)
    }
  }

  def sendData(partitionKey: String, data: Array[Byte]): String = {
    var sentSeqNumbers = new String

    val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data))

    val kinesisCallBack = new FutureCallback[UserRecordResult]() {

      override def onFailure(t: Throwable): Unit = {
        logError(s"Writing to $streamName failed due to ${t.getCause}")
      }

      override def onSuccess(result: UserRecordResult): Unit = {
        val shardId = result.getShardId
        sentSeqNumbers = result.getSequenceNumber
      }
    }

    Futures.addCallback(future, kinesisCallBack)

    producer.flushSync()
    sentSeqNumbers
  }

  def close(): Unit = {
    if (producer != null) {
      producer.flush()
      producer = null
    }
  }

  private def createProjection: UnsafeProjection = {
    val partitionKeyExpression = inputSchema
      .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse(
        throw new IllegalStateException("Required attribute " +
          s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found"))

    partitionKeyExpression.dataType match {
      case StringType | BinaryType => // ok
      case t =>
        throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " +
          "attribute type must be a String or BinaryType")
    }

    val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse(
      throw new IllegalStateException("Required attribute " +
        s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found")
    )

    dataExpression.dataType match {
      case StringType | BinaryType => // ok
      case t =>
        throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " +
          "attribute type must be a String or BinaryType")
    }

    UnsafeProjection.create(
      Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema)
  }
}
Example 7
Source File: KafkaWriter.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
import org.apache.spark.sql.types.{BinaryType, StringType}
import org.apache.spark.util.Utils

private[kafka010] object KafkaWriter extends Logging {
  val TOPIC_ATTRIBUTE_NAME: String = "topic"
  val KEY_ATTRIBUTE_NAME: String = "key"
  val VALUE_ATTRIBUTE_NAME: String = "value"

  override def toString: String = "KafkaWriter"

  def validateQuery(
      schema: Seq[Attribute],
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse(
      if (topic.isEmpty) {
        throw new AnalysisException(s"topic option required when no " +
          s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " +
          s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.")
      } else {
        Literal(topic.get, StringType)
      }
    ).dataType match {
      case StringType => // good
      case _ =>
        throw new AnalysisException(s"Topic type must be a String")
    }
    schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse(
      Literal(null, StringType)
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
    schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse(
      throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found")
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, kafkaParameters, topic)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close())
    }
  }
}
Example 8
Source File: ArrowSummarizer.scala From flint with Apache License 2.0

package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.{
  ArrowSummarizerResult, ArrowSummarizerState, ArrowSummarizer => ArrowSum
}
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ ColumnList, InputAlwaysValid, Summarizer, SummarizerFactory }
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types.{ ArrayType, BinaryType, StructType }

object ArrowSummarizer {
  val baseRowsColumnName = "__baseRows"
  val arrowBatchColumnName = "arrow_bytes"
}

case class ArrowSummarizerFactory(columns: Seq[String], includeBaseRows: Boolean) extends SummarizerFactory {
  override val requiredColumns: ColumnList =
    if (includeBaseRows) {
      ColumnList.All
    } else {
      ColumnList.Sequence(columns)
    }

  override def apply(inputSchema: StructType): ArrowSummarizer = {
    val outputBatchSchema = StructType(columns.map(col => inputSchema(inputSchema.fieldIndex(col))))
    ArrowSummarizer(inputSchema, outputBatchSchema, includeBaseRows, prefixOpt, requiredColumns)
  }
}

case class ArrowSummarizer(
  override val inputSchema: StructType,
  outputBatchSchema: StructType,
  includeBaseRows: Boolean,
  override val prefixOpt: Option[String],
  requiredColumns: ColumnList
) extends Summarizer with InputAlwaysValid {
  override type T = InternalRow
  override type U = ArrowSummarizerState
  override type V = ArrowSummarizerResult

  override val summarizer = ArrowSum(inputSchema, outputBatchSchema, includeBaseRows)
  override val schema: StructType =
    if (includeBaseRows) {
      Schema.of(
        ArrowSummarizer.baseRowsColumnName -> ArrayType(inputSchema),
        ArrowSummarizer.arrowBatchColumnName -> BinaryType
      )
    } else {
      Schema.of(
        ArrowSummarizer.arrowBatchColumnName -> BinaryType
      )
    }

  override def toT(r: InternalRow): T = r

  override def fromV(v: V): InternalRow =
    if (includeBaseRows) {
      InternalRow(new GenericArrayData(v.baseRows), v.arrowBatch)
    } else {
      InternalRow(v.arrowBatch)
    }
}
Example 9
Source File: GenerateOrdering.scala From iolap with Apache License 2.0

package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.{BinaryType, StringType, NumericType}

object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] with Logging {
  import scala.reflect.runtime.{universe => ru}
  import scala.reflect.runtime.universe._

  protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
    in.map(ExpressionCanonicalizer.execute(_).asInstanceOf[SortOrder])

  protected def bind(in: Seq[SortOrder], inputSchema: Seq[Attribute]): Seq[SortOrder] =
    in.map(BindReferences.bindReference(_, inputSchema))

  protected def create(ordering: Seq[SortOrder]): Ordering[Row] = {
    val a = newTermName("a")
    val b = newTermName("b")
    val comparisons = ordering.zipWithIndex.map { case (order, i) =>
      val evalA = expressionEvaluator(order.child)
      val evalB = expressionEvaluator(order.child)

      val compare = order.child.dataType match {
        case BinaryType =>
          q"""
          val x = ${if (order.direction == Ascending) evalA.primitiveTerm else evalB.primitiveTerm}
          val y = ${if (order.direction != Ascending) evalB.primitiveTerm else evalA.primitiveTerm}
          var i = 0
          while (i < x.length && i < y.length) {
            val res = x(i).compareTo(y(i))
            if (res != 0) return res
            i = i+1
          }
          return x.length - y.length
          """
        case _: NumericType =>
          q"""
          val comp = ${evalA.primitiveTerm} - ${evalB.primitiveTerm}
          if(comp != 0) {
            return ${if (order.direction == Ascending) q"comp.toInt" else q"-comp.toInt"}
          }
          """
        case StringType =>
          if (order.direction == Ascending) {
            q"""return ${evalA.primitiveTerm}.compare(${evalB.primitiveTerm})"""
          } else {
            q"""return ${evalB.primitiveTerm}.compare(${evalA.primitiveTerm})"""
          }
      }

      q"""
        i = $a
        ..${evalA.code}
        i = $b
        ..${evalB.code}
        if (${evalA.nullTerm} && ${evalB.nullTerm}) {
          // Nothing
        } else if (${evalA.nullTerm}) {
          return ${if (order.direction == Ascending) q"-1" else q"1"}
        } else if (${evalB.nullTerm}) {
          return ${if (order.direction == Ascending) q"1" else q"-1"}
        } else {
          $compare
        }
      """
    }

    val q"class $orderingName extends $orderingType { ..$body }" = reify {
      class SpecificOrdering extends Ordering[Row] {
        val o = ordering
      }
    }.tree.children.head

    val code = q"""
      class $orderingName extends $orderingType {
        ..$body
        def compare(a: $rowType, b: $rowType): Int = {
          var i: $rowType = null // Holds current row being evaluated.
          ..$comparisons
          return 0
        }
      }
      new $orderingName()
      """
    logDebug(s"Generated Ordering: $code")

    toolBox.eval(code).asInstanceOf[Ordering[Row]]
  }
}
Example 10
Source File: EncodeLong.scala From morpheus with Apache License 2.0

package org.opencypher.morpheus.impl.expressions

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression}
import org.apache.spark.sql.types.{BinaryType, DataType, LongType}
import org.opencypher.morpheus.api.value.MorpheusElement._

case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes {

  override val dataType: DataType = BinaryType

  override val inputTypes: Seq[LongType] = Seq(LongType)

  override protected def nullSafeEval(input: Any): Any =
    EncodeLong.encodeLong(input.asInstanceOf[Long])

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
    defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))")
}

object EncodeLong {

  private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2)
  private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2)
  private final val otherBitsMask = ~varLength7BitMask
  private final val maxBytesForLongVarEncoding = 10

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def encodeLong(l: Long): Array[Byte] = {
    val tempResult = new Array[Byte](maxBytesForLongVarEncoding)

    var remainder = l
    var index = 0

    while ((remainder & otherBitsMask) != 0) {
      tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte
      remainder >>>= 7
      index += 1
    }
    tempResult(index) = remainder.toByte

    val result = new Array[Byte](index + 1)
    System.arraycopy(tempResult, 0, result, 0, index + 1)
    result
  }

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def decodeLong(input: Array[Byte]): Long = {
    assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input")
    var index = 0
    var currentByte = input(index)
    var decoded = currentByte & varLength7BitMask
    var nextLeftShift = 7

    while ((currentByte & moreBytesBitMask) != 0) {
      index += 1
      currentByte = input(index)
      decoded |= (currentByte & varLength7BitMask) << nextLeftShift
      nextLeftShift += 7
    }

    assert(index == input.length - 1,
      s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.")

    decoded
  }

  implicit class ColumnLongOps(val c: Column) extends AnyVal {

    def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name)

    def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr))
  }
}
Example 11
Source File: MorpheusGraphExport.scala From morpheus with Apache License 2.0

package org.opencypher.morpheus.api.io.util

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{BinaryType, StructField, StructType}
import org.opencypher.morpheus.api.io.{GraphElement, Relationship}
import org.opencypher.morpheus.impl.convert.SparkConversions._
import org.opencypher.morpheus.impl.table.SparkTable.DataFrameTable
import org.opencypher.okapi.api.schema.PropertyGraphSchema
import org.opencypher.okapi.api.types.{CTNode, CTRelationship}
import org.opencypher.okapi.impl.util.StringEncodingUtilities._
import org.opencypher.okapi.ir.api.expr.{Property, Var}
import org.opencypher.okapi.relational.api.graph.RelationalCypherGraph

// TODO: Add documentation that describes the canonical table format
object MorpheusGraphExport {

  implicit class CanonicalTableSparkSchema(val schema: PropertyGraphSchema) extends AnyVal {

    def canonicalNodeStructType(labels: Set[String]): StructType = {
      val id = StructField(GraphElement.sourceIdKey, BinaryType, nullable = false)
      val properties = schema.nodePropertyKeys(labels).toSeq
        .map { case (propertyName, cypherType) => propertyName.toPropertyColumnName -> cypherType }
        .sortBy { case (propertyColumnName, _) => propertyColumnName }
        .map { case (propertyColumnName, cypherType) =>
          StructField(propertyColumnName, cypherType.getSparkType, cypherType.isNullable)
        }
      StructType(id +: properties)
    }

    def canonicalRelStructType(relType: String): StructType = {
      val id = StructField(GraphElement.sourceIdKey, BinaryType, nullable = false)
      val sourceId = StructField(Relationship.sourceStartNodeKey, BinaryType, nullable = false)
      val targetId = StructField(Relationship.sourceEndNodeKey, BinaryType, nullable = false)
      val properties = schema.relationshipPropertyKeys(relType).toSeq.sortBy(_._1).map {
        case (propertyName, cypherType) =>
          StructField(propertyName.toPropertyColumnName, cypherType.getSparkType, cypherType.isNullable)
      }
      StructType(id +: sourceId +: targetId +: properties)
    }
  }

  implicit class CanonicalTableExport(graph: RelationalCypherGraph[DataFrameTable]) {

    def canonicalNodeTable(labels: Set[String]): DataFrame = {
      val ct = CTNode(labels)
      val v = Var("n")(ct)
      val nodeRecords = graph.nodes(v.name, ct, exactLabelMatch = true)
      val header = nodeRecords.header

      val idRename = header.column(v) -> GraphElement.sourceIdKey
      val properties: Set[Property] = header.propertiesFor(v)
      val propertyRenames = properties.map { p => header.column(p) -> p.key.name.toPropertyColumnName }

      val selectColumns = (idRename :: propertyRenames.toList.sortBy(_._2)).map {
        case (oldName, newName) => nodeRecords.table.df.col(oldName).as(newName)
      }

      nodeRecords.table.df.select(selectColumns: _*)
    }

    def canonicalRelationshipTable(relType: String): DataFrame = {
      val ct = CTRelationship(relType)
      val v = Var("r")(ct)
      val relRecords = graph.relationships(v.name, ct)
      val header = relRecords.header

      val idRename = header.column(v) -> GraphElement.sourceIdKey
      val sourceIdRename = header.column(header.startNodeFor(v)) -> Relationship.sourceStartNodeKey
      val targetIdRename = header.column(header.endNodeFor(v)) -> Relationship.sourceEndNodeKey
      val properties: Set[Property] = relRecords.header.propertiesFor(v)
      val propertyRenames = properties.map { p => relRecords.header.column(p) -> p.key.name.toPropertyColumnName }

      val selectColumns = (idRename :: sourceIdRename :: targetIdRename :: propertyRenames.toList.sorted).map {
        case (oldName, newName) => relRecords.table.df.col(oldName).as(newName)
      }

      relRecords.table.df.select(selectColumns: _*)
    }
  }
}
Example 12
Source File: BinaryFileReader.scala From mmlspark with MIT License

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark

import com.microsoft.ml.spark.core.env.StreamUtilities
import com.microsoft.ml.spark.core.schema.BinaryFileSchema
import com.microsoft.ml.spark.core.utils.AsyncUtils
import org.apache.commons.io.IOUtils
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
import org.apache.spark.binary.BinaryFileFormat
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.binary.ConfUtils
import org.apache.spark.sql.types.BinaryType

import scala.concurrent.{ExecutionContext, Future}
import scala.concurrent.duration.Duration

object BinaryFileReader {

  private def recursePath(fileSystem: FileSystem,
                          path: Path,
                          pathFilter: FileStatus => Boolean,
                          visitedSymlinks: Set[Path]): Array[Path] = {
    val filteredPaths = fileSystem.listStatus(path).filter(pathFilter)
    val filteredDirs = filteredPaths.filter(fs => fs.isDirectory & !visitedSymlinks(fs.getPath))
    val symlinksFound = visitedSymlinks ++ filteredDirs.filter(_.isSymlink).map(_.getPath)
    filteredPaths.map(_.getPath) ++ filteredDirs.map(_.getPath)
      .flatMap(p => recursePath(fileSystem, p, pathFilter, symlinksFound))
  }

  def recursePath(fileSystem: FileSystem, path: Path, pathFilter: FileStatus => Boolean): Array[Path] = {
    recursePath(fileSystem, path, pathFilter, Set())
  }

  def readFromPaths(df: DataFrame,
                    pathCol: String,
                    bytesCol: String,
                    concurrency: Int,
                    timeout: Int
                   ): DataFrame = {
    val outputSchema = df.schema.add(bytesCol, BinaryType, nullable = true)
    val encoder = RowEncoder(outputSchema)
    val hconf = ConfUtils.getHConf(df)

    df.mapPartitions { rows =>
      val futures = rows.map { row: Row =>
        Future {
          val path = new Path(row.getAs[String](pathCol))
          val fs = path.getFileSystem(hconf.value)
          val bytes = StreamUtilities.using(fs.open(path)) { is => IOUtils.toByteArray(is) }.get
          val ret = Row.merge(Seq(row, Row(bytes)): _*)
          ret
        }(ExecutionContext.global)
      }
      AsyncUtils.bufferedAwait(
        futures, concurrency, Duration.fromNanos(timeout * (20 ^ 6).toLong))(ExecutionContext.global)
    }(encoder)
  }
}
Example 13
Source File: BigQuerySource.scala From spark-bigquery with Apache License 2.0

package com.samelamin.spark.bigquery.streaming

import java.math.BigInteger

import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.BigQueryClient
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.{Offset, _}
import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType}
import com.samelamin.spark.bigquery._
import com.samelamin.spark.bigquery.converters.SchemaConverters
import org.joda.time.DateTime
import org.slf4j.LoggerFactory

// NOTE: this listing is an excerpt of the BigQuerySource class body; the enclosing class
// declaration and the members it references (logger, fullyQualifiedOutputTableId,
// timestampColumn) are not included in the excerpt.

  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toLong
    val endIndex = end.asInstanceOf[LongOffset].offset.toLong
    val startPartitionTime = new DateTime(startIndex).toLocalDate
    val endPartitionTime = new DateTime(endIndex).toLocalDate.toString

    logger.info(s"Fetching data between $startIndex and $endIndex")
    val query =
      s"""
         |SELECT
         |  *
         |FROM
         |  `${fullyQualifiedOutputTableId.replace(':','.')}`
         |WHERE
         |  $timestampColumn BETWEEN TIMESTAMP_MILLIS($startIndex) AND TIMESTAMP_MILLIS($endIndex)
         |  AND _PARTITIONTIME BETWEEN TIMESTAMP('$startPartitionTime') AND TIMESTAMP('$endPartitionTime')
         |""".stripMargin
    val bigQuerySQLContext = new BigQuerySQLContext(sqlContext)
    val df = bigQuerySQLContext.bigQuerySelect(query)
    df
  }

  override def stop(): Unit = {}

  def getConvertedSchema(sqlContext: SQLContext): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }
}

object BigQuerySource {
  val DEFAULT_SCHEMA = StructType(
    StructField("Sample Column", StringType) ::
      StructField("value", BinaryType) :: Nil
  )
}
Example 14
Source File: BinaryTypeBenchmark.scala From memsql-spark-connector with Apache License 2.0

package com.memsql.spark

import java.sql.{Connection, DriverManager}
import java.util.Properties

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import com.memsql.spark.BatchInsertBenchmark.{df, executeQuery}
import org.apache.spark.sql.types.{BinaryType, IntegerType}
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

// BinaryTypeBenchmark is written to profile writing of BinaryType data with a CPU profiler.
// This feature is accessible in the Ultimate version of IntelliJ IDEA;
// see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details.
object BinaryTypeBenchmark extends App {
  final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost")
  final val masterPort: String = sys.props.getOrElse("memsql.port", "5506")

  val spark: SparkSession = SparkSession
    .builder()
    .master("local")
    .config("spark.sql.shuffle.partitions", "1")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}")
    .config("spark.datasource.memsql.database", "testdb")
    .getOrCreate()

  def jdbcConnection: Loan[Connection] = {
    val connProperties = new Properties()
    connProperties.put("user", "root")

    Loan(
      DriverManager.getConnection(
        s"jdbc:mysql://$masterHost:$masterPort",
        connProperties
      ))
  }

  def executeQuery(sql: String): Unit = {
    jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql)))
  }

  executeQuery("set global default_partitions_per_leaf = 2")
  executeQuery("drop database if exists testdb")
  executeQuery("create database testdb")

  def genRandomByte(): Byte = (Random.nextInt(256) - 128).toByte
  def genRandomRow(): Array[Byte] = Array.fill(1000)(genRandomByte())

  val df = spark.createDF(
    List.fill(100000)(genRandomRow()).zipWithIndex,
    List(("data", BinaryType, true), ("id", IntegerType, true))
  )

  val start1 = System.nanoTime()
  df.write
    .format("memsql")
    .mode(SaveMode.Overwrite)
    .save("testdb.LoadData")

  println("Elapsed time: " + (System.nanoTime() - start1) + "ns [LoadData CSV]")

  val start2 = System.nanoTime()
  df.write
    .format("memsql")
    .option("tableKey.primary", "id")
    .option("onDuplicateKeySQL", "id = id")
    .mode(SaveMode.Overwrite)
    .save("testdb.BatchInsert")

  println("Elapsed time: " + (System.nanoTime() - start2) + "ns [BatchInsert]")

  val avroStart = System.nanoTime()
  df.write
    .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
    .mode(SaveMode.Overwrite)
    .option(MemsqlOptions.LOAD_DATA_FORMAT, "Avro")
    .save("testdb.AvroSerialization")
  println("Elapsed time: " + (System.nanoTime() - avroStart) + "ns [LoadData Avro]")
}
Example 15
Source File: EventHubsWriter.scala From azure-event-hubs-spark with Apache License 2.0

package org.apache.spark.sql.eventhubs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{ AnalysisException, SparkSession }
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types.{ BinaryType, StringType }
import org.apache.spark.util.Utils

private[eventhubs] object EventHubsWriter extends Logging {

  val BodyAttributeName = "body"
  val PartitionKeyAttributeName = "partitionKey"
  val PartitionIdAttributeName = "partition"
  val PropertiesAttributeName = "properties"

  override def toString: String = "EventHubsWriter"

  private def validateQuery(schema: Seq[Attribute], parameters: Map[String, String]): Unit = {
    schema
      .find(_.name == BodyAttributeName)
      .getOrElse(
        throw new AnalysisException(s"Required attribute '$BodyAttributeName' not found.")
      )
      .dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(
          s"$BodyAttributeName attribute type " +
            s"must be a String or BinaryType.")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      parameters: Map[String, String]
  ): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, parameters)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new EventHubsWriteTask(parameters, schema)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close()
      )
    }
  }
}