org.apache.parquet.io.api.Binary Scala Examples
The following examples show how to use org.apache.parquet.io.api.Binary.
Each example notes, above the snippet, the original project it comes from and that project's license.
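Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name is invented) of the Binary calls that recur throughout them: Binary.fromString for UTF-8 text, Binary.fromReusedByteArray for raw bytes, and toStringUsingUTF8/length for reading values back.

import java.nio.charset.StandardCharsets
import org.apache.parquet.io.api.Binary

object BinaryBasics extends App {
  // Encode a String as a Parquet Binary (always UTF-8).
  val fromText: Binary = Binary.fromString("hello parquet")

  // Wrap an existing byte array; "reused" signals the array may be mutated by the caller,
  // so Parquet copies it if it needs to retain the value.
  val bytes: Array[Byte] = "hello parquet".getBytes(StandardCharsets.UTF_8)
  val fromBytes: Binary = Binary.fromReusedByteArray(bytes)

  // Read values back out.
  println(fromText.toStringUsingUTF8) // "hello parquet"
  println(fromBytes.length)           // number of bytes, here 13
}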
Example 1
Source File: JsonLogicalType.scala From embulk-output-s3_parquet with MIT License
package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.embulk.config.ConfigException
import org.embulk.output.s3_parquet.catalog.GlueDataType
import org.embulk.spi.Column
import org.embulk.spi.`type`.{
  BooleanType,
  DoubleType,
  JsonType,
  LongType,
  StringType,
  TimestampType
}
import org.embulk.spi.time.{Timestamp, TimestampFormatter}
import org.msgpack.value.{Value, ValueFactory}
import org.slf4j.{Logger, LoggerFactory}

object JsonLogicalType extends ParquetColumnType {

  private val logger: Logger =
    LoggerFactory.getLogger(JsonLogicalType.getClass)

  override def primitiveType(column: Column): PrimitiveType =
    column.getType match {
      case _: BooleanType | _: LongType | _: DoubleType | _: StringType |
          _: JsonType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.jsonType())
          .named(column.getName)
      case _: TimestampType | _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def glueDataType(column: Column): GlueDataType =
    column.getType match {
      case _: BooleanType | _: LongType | _: DoubleType | _: StringType |
          _: JsonType =>
        warningWhenConvertingJsonToGlueType(GlueDataType.STRING)
        GlueDataType.STRING
      case _: TimestampType | _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit =
    consumeJson(consumer, ValueFactory.newBoolean(v))

  override def consumeString(consumer: RecordConsumer, v: String): Unit =
    consumeJson(consumer, ValueFactory.newString(v))

  override def consumeLong(consumer: RecordConsumer, v: Long): Unit =
    consumeJson(consumer, ValueFactory.newInteger(v))

  override def consumeDouble(consumer: RecordConsumer, v: Double): Unit =
    consumeJson(consumer, ValueFactory.newFloat(v))

  override def consumeTimestamp(
      consumer: RecordConsumer,
      v: Timestamp,
      formatter: TimestampFormatter
  ): Unit = throw newUnsupportedMethodException("consumeTimestamp")

  override def consumeJson(consumer: RecordConsumer, v: Value): Unit =
    consumer.addBinary(Binary.fromString(v.toJson))

  private def warningWhenConvertingJsonToGlueType(
      glueType: GlueDataType
  ): Unit = {
    logger.warn(
      s"json is converted" +
        s" to Glue ${glueType.name} but this is not represented correctly, because Glue" +
        s" does not support json type. Please use `catalog.column_options` to define the type."
    )
  }
}
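A short hedged sketch (the column name and the object are invented, not part of embulk-output-s3_parquet) of what the two building blocks above produce: primitiveType declares an optional BINARY column annotated as JSON, and consumeJson serialises a msgpack Value to its JSON text before handing it to the consumer as a Binary.

import org.apache.parquet.io.api.Binary
import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.msgpack.value.ValueFactory

object JsonColumnSketch extends App {
  // The same builder chain as JsonLogicalType.primitiveType, for a hypothetical column "payload".
  val parquetType: PrimitiveType = Types
    .optional(PrimitiveTypeName.BINARY)
    .as(LogicalTypeAnnotation.jsonType())
    .named("payload")
  println(parquetType) // roughly: optional binary payload (JSON)

  // consumeJson boils down to: msgpack Value -> JSON string -> Binary.
  val value = ValueFactory
    .newMapBuilder()
    .put(ValueFactory.newString("id"), ValueFactory.newInteger(1L))
    .build()
  val binary: Binary = Binary.fromString(value.toJson)
  println(binary.toStringUsingUTF8) // {"id":1}
}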
Example 2
Source File: DefaultColumnType.scala From embulk-output-s3_parquet with MIT License
package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.embulk.config.ConfigException
import org.embulk.output.s3_parquet.catalog.GlueDataType
import org.embulk.spi.time.{Timestamp, TimestampFormatter}
import org.embulk.spi.Column
import org.embulk.spi.`type`.{
  BooleanType,
  DoubleType,
  JsonType,
  LongType,
  StringType,
  TimestampType
}
import org.msgpack.value.Value

object DefaultColumnType extends ParquetColumnType {

  override def primitiveType(column: Column): PrimitiveType =
    column.getType match {
      case _: BooleanType =>
        Types.optional(PrimitiveTypeName.BOOLEAN).named(column.getName)
      case _: LongType =>
        Types.optional(PrimitiveTypeName.INT64).named(column.getName)
      case _: DoubleType =>
        Types.optional(PrimitiveTypeName.DOUBLE).named(column.getName)
      case _: StringType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _: TimestampType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _: JsonType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def glueDataType(column: Column): GlueDataType =
    column.getType match {
      case _: BooleanType => GlueDataType.BOOLEAN
      case _: LongType    => GlueDataType.BIGINT
      case _: DoubleType  => GlueDataType.DOUBLE
      case _: StringType | _: TimestampType | _: JsonType =>
        GlueDataType.STRING
      case _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit =
    consumer.addBoolean(v)

  override def consumeString(consumer: RecordConsumer, v: String): Unit =
    consumer.addBinary(Binary.fromString(v))

  override def consumeLong(consumer: RecordConsumer, v: Long): Unit =
    consumer.addLong(v)

  override def consumeDouble(consumer: RecordConsumer, v: Double): Unit =
    consumer.addDouble(v)

  override def consumeTimestamp(
      consumer: RecordConsumer,
      v: Timestamp,
      formatter: TimestampFormatter
  ): Unit = consumer.addBinary(Binary.fromString(formatter.format(v)))

  override def consumeJson(consumer: RecordConsumer, v: Value): Unit =
    consumer.addBinary(Binary.fromString(v.toJson))
}
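As a follow-up sketch (the record name and columns are made up, not from the plugin), the per-column PrimitiveTypes that DefaultColumnType.primitiveType returns would typically be assembled into one Parquet MessageType; the MessageType constructor from parquet-column does that directly.

import org.apache.parquet.schema.{LogicalTypeAnnotation, MessageType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName

object MessageSchemaSketch extends App {
  // Hypothetical columns mirroring the mapping above: BOOLEAN, INT64 and a UTF-8 BINARY.
  val schema = new MessageType(
    "embulk_record",
    Types.optional(PrimitiveTypeName.BOOLEAN).named("active"),
    Types.optional(PrimitiveTypeName.INT64).named("count"),
    Types
      .optional(PrimitiveTypeName.BINARY)
      .as(LogicalTypeAnnotation.stringType())
      .named("name")
  )

  println(schema) // message embulk_record { optional boolean active; ... }
}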
Example 3
Source File: MockParquetRecordConsumer.scala From embulk-output-s3_parquet with MIT License
package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.io.api.{Binary, RecordConsumer}

case class MockParquetRecordConsumer() extends RecordConsumer {

  case class Data private (messages: Seq[Message] = Seq()) {
    def toData: Seq[Seq[Any]] = messages.map(_.toData)
  }

  case class Message private (fields: Seq[Field] = Seq()) {
    def toData: Seq[Any] = {
      val maxIndex: Int = fields.maxBy(_.index).index
      val raw: Map[Int, Any] = fields.map(f => f.index -> f.value).toMap
      0.to(maxIndex).map(idx => raw.get(idx).orNull)
    }
  }

  case class Field private (index: Int = 0, value: Any = null)

  private var _data: Data = Data()
  private var _message: Message = Message()
  private var _field: Field = Field()

  override def startMessage(): Unit = _message = Message()
  override def endMessage(): Unit =
    _data = _data.copy(messages = _data.messages :+ _message)
  override def startField(field: String, index: Int): Unit =
    _field = Field(index = index)
  override def endField(field: String, index: Int): Unit =
    _message = _message.copy(fields = _message.fields :+ _field)
  override def startGroup(): Unit = throw new UnsupportedOperationException
  override def endGroup(): Unit = throw new UnsupportedOperationException
  override def addInteger(value: Int): Unit = _field = _field.copy(value = value)
  override def addLong(value: Long): Unit = _field = _field.copy(value = value)
  override def addBoolean(value: Boolean): Unit = _field = _field.copy(value = value)
  override def addBinary(value: Binary): Unit = _field = _field.copy(value = value)
  override def addFloat(value: Float): Unit = _field = _field.copy(value = value)
  override def addDouble(value: Double): Unit = _field = _field.copy(value = value)

  def writingMessage(f: => Unit): Unit = {
    startMessage()
    f
    endMessage()
  }

  def writingField(field: String, index: Int)(f: => Unit): Unit = {
    startField(field, index)
    f
    endField(field, index)
  }

  def writingSampleField(f: => Unit): Unit = {
    writingMessage {
      writingField("a", 0)(f)
    }
  }

  def data: Seq[Seq[Any]] = _data.toData
}
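A minimal usage sketch of the mock above (the field names and values are invented for illustration): writingMessage/writingField drive the RecordConsumer callbacks, and data exposes what was added, including Binary values.

import org.apache.parquet.io.api.Binary
import org.embulk.output.s3_parquet.parquet.MockParquetRecordConsumer

object MockConsumerSketch extends App {
  val consumer = MockParquetRecordConsumer()

  consumer.writingMessage {
    consumer.writingField("name", 0)(consumer.addBinary(Binary.fromString("alice")))
    consumer.writingField("age", 1)(consumer.addLong(42L))
  }

  // One message with two positional fields; Binary equality is content-based,
  // so the first cell compares equal to Binary.fromString("alice").
  println(consumer.data) // e.g. List(Vector(Binary{"alice"}, 42))
}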
Example 4
Source File: CustomType.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver.{TypedSchemaDef, typedSchemaDef}
import org.apache.parquet.io.api.Binary
import org.apache.parquet.schema.{OriginalType, PrimitiveType}

import scala.util.Random

object CustomType {

  object Dict {

    sealed trait Type
    case object A extends Type
    case object B extends Type
    case object C extends Type
    case object D extends Type

    val values: List[Type] = List(A, B, C, D)

    def valueOf(name: String): Type =
      values.find(_.toString == name)
        .getOrElse(throw new IllegalArgumentException(s"Invalid dict name: $name"))

    def random: Type = values(Random.nextInt(values.length))

    // required for reading and writing
    implicit val codec: OptionalValueCodec[Type] = new OptionalValueCodec[Type] {
      override protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): Type =
        value match {
          case BinaryValue(binary) => valueOf(binary.toStringUsingUTF8)
        }
      override protected def encodeNonNull(data: Type, configuration: ValueCodecConfiguration): Value =
        BinaryValue(Binary.fromString(data.toString))
    }

    // required for writing only
    implicit val schema: TypedSchemaDef[Type] = typedSchemaDef[Type](
      PrimitiveSchemaDef(
        primitiveType = PrimitiveType.PrimitiveTypeName.BINARY,
        required = false,
        originalType = Some(OriginalType.UTF8)
      )
    )
  }
}
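A hedged usage sketch for the custom type above. It assumes the parquet4s 1.x style ParquetWriter.writeAndClose / ParquetReader.read API; the Record case class, file path and field names are invented. With the implicit codec and schema in scope, Dict.Type fields are persisted as UTF-8 annotated BINARY columns.

import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter}
import com.github.mjakubowski84.parquet4s.CustomType.Dict

object DictRoundTrip {

  // Hypothetical record type using the custom Dict.Type field.
  case class Record(id: Int, dict: Dict.Type)

  def main(args: Array[String]): Unit = {
    val records = (1 to 4).map(i => Record(i, Dict.random))

    // Assumed parquet4s 1.x API: write a local file, then read it back.
    val path = "/tmp/dict-example.parquet"
    ParquetWriter.writeAndClose(path, records)

    val readBack = ParquetReader.read[Record](path)
    try readBack.foreach(println)
    finally readBack.close()
  }
}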
Example 5
Source File: Decimals.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import java.math.MathContext
import java.nio.ByteBuffer

import org.apache.parquet.io.api.Binary

object Decimals {
  val Scale = 18
  val Precision = 38
  val ByteArrayLength = 16
  val MathContext = new MathContext(Precision)

  private def rescale(original: BigDecimal): BigDecimal = {
    if (original.scale == Scale && original.mc == MathContext) original
    else BigDecimal.decimal(original.bigDecimal, MathContext).setScale(Scale, BigDecimal.RoundingMode.HALF_UP)
  }

  def rescaleBinary(binary: Binary, originalScale: Int, originalMathContext: MathContext): Binary =
    binaryFromDecimal(decimalFromBinary(binary, originalScale, originalMathContext))

  def decimalFromBinary(binary: Binary, scale: Int = Scale, mathContext: MathContext = MathContext): BigDecimal =
    BigDecimal(BigInt(binary.getBytes), scale, mathContext)

  def binaryFromDecimal(decimal: BigDecimal): Binary = {
    val buf = ByteBuffer.allocate(ByteArrayLength)
    val unscaled = rescale(decimal).bigDecimal.unscaledValue().toByteArray
    // BigInteger is stored in tail of byte array, sign is stored in unoccupied cells
    val sign: Byte = if (unscaled.head < 0) -1 else 0
    (0 until ByteArrayLength - unscaled.length).foreach(_ => buf.put(sign))
    buf.put(unscaled)
    Binary.fromReusedByteArray(buf.array())
  }
}
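A small round-trip sketch for the helper above (the values are chosen arbitrarily): binaryFromDecimal rescales to scale 18, sign-extends the unscaled value into a fixed 16-byte array, and decimalFromBinary reverses the process.

import com.github.mjakubowski84.parquet4s.Decimals
import org.apache.parquet.io.api.Binary

object DecimalsRoundTrip extends App {
  val original = BigDecimal("-1234.56")

  val binary: Binary = Decimals.binaryFromDecimal(original)
  println(binary.length) // always 16 bytes; negative values are sign-extended with 0xFF

  val restored: BigDecimal = Decimals.decimalFromBinary(binary)
  println(restored)             // -1234.560000000000000000 (scale 18)
  println(restored == original) // true: Scala BigDecimal equality compares numeric value, not scale
}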
Example 6
Source File: TimestampPrimitiveConverter.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import jodd.datetime.{JDateTime, JulianDateStamp}
import org.apache.kafka.connect.data.Field
import org.apache.parquet.example.data.simple.NanoTime
import org.apache.parquet.io.api.{Binary, PrimitiveConverter}

// see https://issues.apache.org/jira/browse/HIVE-6394 and
// https://issues.apache.org/jira/browse/SPARK-10177 for compile ideas
class TimestampPrimitiveConverter(field: Field,
                                  builder: scala.collection.mutable.Map[String, Any]) extends PrimitiveConverter {

  private val nanosInDay = BigDecimal(60 * 60 * 24) * 1000 * 1000 * 1000
  private val offset = nanosInDay / 2

  override def addBinary(x: Binary): Unit = {
    val nano = NanoTime.fromBinary(x)
    val jdt = new JDateTime()
    val f = (BigDecimal(nano.getTimeOfDayNanos) - offset) / nanosInDay
    jdt.setJulianDate(new JulianDateStamp(nano.getJulianDay, f.doubleValue()))
    builder.put(field.name, jdt.convertToSqlTimestamp)
  }
}
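A hedged sketch of feeding the converter (the field name and expected wall-clock value are illustrative, not from the project): Parquet's legacy INT96 timestamps pack a Julian day number plus nanoseconds-of-day into a 12-byte Binary, which NanoTime can both build and parse.

import com.landoop.streamreactor.connect.hive.parquet.TimestampPrimitiveConverter
import org.apache.kafka.connect.data.{Field, Schema}
import org.apache.parquet.example.data.simple.NanoTime
import scala.collection.mutable

object TimestampConverterSketch extends App {
  val results = mutable.Map.empty[String, Any]
  // The converter only uses the field's name, so the Connect schema here is arbitrary.
  val field = new Field("created_at", 0, Schema.OPTIONAL_INT64_SCHEMA)
  val converter = new TimestampPrimitiveConverter(field, results)

  // Julian day 2458850 is 2020-01-01; noon is 43,200,000,000,000 ns into the day.
  val noonNanos = 12L * 60 * 60 * 1000 * 1000 * 1000
  converter.addBinary(new NanoTime(2458850, noonNanos).toBinary)

  println(results("created_at")) // a java.sql.Timestamp around 2020-01-01 12:00
}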
Example 7
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written
  private var consumer: RecordConsumer = _

  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext =
    new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])

  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)

  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  private def writeStructFields(struct: Struct): Unit = {
    for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        val struct = value.asInstanceOf[Struct]
        writeGroup {
          schema.fields.asScala
            .map { field => field -> struct.get(field) }
            .zipWithIndex.foreach { case ((field, v), k) =>
              writeField(field.name, k) {
                valueWriter(field.schema)(v)
              }
            }
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    // consumer.startMessage()
    f
    //consumer.endMessage()
    consumer.endGroup()
  }

  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
}
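One detail worth noting in valueWriter above (this comparison sketch is not from the project): Binary.fromString always encodes UTF-8, while value.toString.getBytes with no argument uses the JVM default charset, so the two only coincide when that default is UTF-8. The sketch shows the explicit-charset equivalent of the call used for STRING fields.

import java.nio.charset.StandardCharsets
import org.apache.parquet.io.api.Binary

object StringBinarySketch extends App {
  val s = "zażółć" // any non-ASCII text makes the charset choice observable

  val viaFromString = Binary.fromString(s) // always UTF-8
  val viaUtf8Bytes  = Binary.fromReusedByteArray(s.getBytes(StandardCharsets.UTF_8))

  // Equal byte-for-byte regardless of the JVM's file.encoding setting;
  // s.getBytes() with no charset depends on the platform default and may differ.
  println(viaFromString == viaUtf8Bytes) // true
}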