org.apache.parquet.io.api.Binary Scala Examples

The following examples show how to use org.apache.parquet.io.api.Binary. Each example is taken from an open-source project; the header above it names the original source file, project, and license.
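Before the project examples, here is a minimal self-contained sketch of the Binary entry points that recur below (Binary.fromString, Binary.fromReusedByteArray, toStringUsingUTF8, getBytes); the values are illustrative only.

import java.nio.charset.StandardCharsets
import org.apache.parquet.io.api.Binary

object BinaryBasics extends App {
  // Build a Binary from a String (UTF-8 encoded) and read it back.
  val fromString: Binary = Binary.fromString("hello")
  println(fromString.toStringUsingUTF8) // hello

  // Wrap an existing byte array. "Reused" tells Parquet the caller may overwrite
  // the array later, so the bytes are not assumed to be immutable.
  val bytes = "world".getBytes(StandardCharsets.UTF_8)
  val fromBytes: Binary = Binary.fromReusedByteArray(bytes)
  println(fromBytes.getBytes.length) // 5
}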
Example 1
Source File: JsonLogicalType.scala    From embulk-output-s3_parquet   with MIT License
package org.embulk.output.s3_parquet.parquet
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.embulk.config.ConfigException
import org.embulk.output.s3_parquet.catalog.GlueDataType
import org.embulk.spi.Column
import org.embulk.spi.`type`.{
  BooleanType,
  DoubleType,
  JsonType,
  LongType,
  StringType,
  TimestampType
}
import org.embulk.spi.time.{Timestamp, TimestampFormatter}
import org.msgpack.value.{Value, ValueFactory}
import org.slf4j.{Logger, LoggerFactory}

object JsonLogicalType extends ParquetColumnType {
  private val logger: Logger = LoggerFactory.getLogger(JsonLogicalType.getClass)
  override def primitiveType(column: Column): PrimitiveType =
    column.getType match {
      case _: BooleanType | _: LongType | _: DoubleType | _: StringType |
          _: JsonType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.jsonType())
          .named(column.getName)
      case _: TimestampType | _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def glueDataType(column: Column): GlueDataType =
    column.getType match {
      case _: BooleanType | _: LongType | _: DoubleType | _: StringType |
          _: JsonType =>
        warningWhenConvertingJsonToGlueType(GlueDataType.STRING)
        GlueDataType.STRING
      case _: TimestampType | _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit =
    consumeJson(consumer, ValueFactory.newBoolean(v))

  override def consumeString(consumer: RecordConsumer, v: String): Unit =
    consumeJson(consumer, ValueFactory.newString(v))

  override def consumeLong(consumer: RecordConsumer, v: Long): Unit =
    consumeJson(consumer, ValueFactory.newInteger(v))

  override def consumeDouble(consumer: RecordConsumer, v: Double): Unit =
    consumeJson(consumer, ValueFactory.newFloat(v))

  override def consumeTimestamp(
      consumer: RecordConsumer,
      v: Timestamp,
      formatter: TimestampFormatter
  ): Unit = throw newUnsupportedMethodException("consumeTimestamp")

  override def consumeJson(consumer: RecordConsumer, v: Value): Unit =
    consumer.addBinary(Binary.fromString(v.toJson))

  private def warningWhenConvertingJsonToGlueType(
      glueType: GlueDataType
  ): Unit = {
    logger.warn(
      s"json is converted" +
        s" to Glue ${glueType.name} but this is not represented correctly, because Glue" +
        s" does not support json type. Please use `catalog.column_options` to define the type."
    )
  }

} 
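A minimal usage sketch, assuming the MockParquetRecordConsumer shown in Example 3 below is available in the same package (in production the RecordConsumer comes from the Parquet writer):

import org.msgpack.value.ValueFactory

val consumer = MockParquetRecordConsumer()
consumer.writingSampleField {
  JsonLogicalType.consumeJson(consumer, ValueFactory.newString("hello"))
}
// One message with one field: a Binary holding the JSON text "hello" (quoted).
println(consumer.data)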
Example 2
Source File: DefaultColumnType.scala    From embulk-output-s3_parquet   with MIT License
package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.{LogicalTypeAnnotation, PrimitiveType, Types}
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName
import org.embulk.config.ConfigException
import org.embulk.output.s3_parquet.catalog.GlueDataType
import org.embulk.spi.time.{Timestamp, TimestampFormatter}
import org.embulk.spi.Column
import org.embulk.spi.`type`.{
  BooleanType,
  DoubleType,
  JsonType,
  LongType,
  StringType,
  TimestampType
}
import org.msgpack.value.Value

object DefaultColumnType extends ParquetColumnType {
  override def primitiveType(column: Column): PrimitiveType =
    column.getType match {
      case _: BooleanType =>
        Types.optional(PrimitiveTypeName.BOOLEAN).named(column.getName)
      case _: LongType =>
        Types.optional(PrimitiveTypeName.INT64).named(column.getName)
      case _: DoubleType =>
        Types.optional(PrimitiveTypeName.DOUBLE).named(column.getName)
      case _: StringType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _: TimestampType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _: JsonType =>
        Types
          .optional(PrimitiveTypeName.BINARY)
          .as(LogicalTypeAnnotation.stringType())
          .named(column.getName)
      case _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def glueDataType(column: Column): GlueDataType =
    column.getType match {
      case _: BooleanType =>
        GlueDataType.BOOLEAN
      case _: LongType =>
        GlueDataType.BIGINT
      case _: DoubleType =>
        GlueDataType.DOUBLE
      case _: StringType | _: TimestampType | _: JsonType =>
        GlueDataType.STRING
      case _ =>
        throw new ConfigException(s"Unsupported column type: ${column.getName}")
    }

  override def consumeBoolean(consumer: RecordConsumer, v: Boolean): Unit =
    consumer.addBoolean(v)
  override def consumeString(consumer: RecordConsumer, v: String): Unit =
    consumer.addBinary(Binary.fromString(v))
  override def consumeLong(consumer: RecordConsumer, v: Long): Unit =
    consumer.addLong(v)
  override def consumeDouble(consumer: RecordConsumer, v: Double): Unit =
    consumer.addDouble(v)
  override def consumeTimestamp(
      consumer: RecordConsumer,
      v: Timestamp,
      formatter: TimestampFormatter
  ): Unit = consumer.addBinary(Binary.fromString(formatter.format(v)))
  override def consumeJson(consumer: RecordConsumer, v: Value): Unit =
    consumer.addBinary(Binary.fromString(v.toJson))
} 
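A minimal usage sketch of the consume* methods, again assuming the MockParquetRecordConsumer from Example 3 below (field names and values are illustrative):

val consumer = MockParquetRecordConsumer()
consumer.writingMessage {
  consumer.writingField("flag", 0)(DefaultColumnType.consumeBoolean(consumer, true))
  consumer.writingField("name", 1)(DefaultColumnType.consumeString(consumer, "alice"))
  consumer.writingField("score", 2)(DefaultColumnType.consumeDouble(consumer, 1.5))
}
// One message with three fields: true, Binary("alice"), 1.5
println(consumer.data)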
Example 3
Source File: MockParquetRecordConsumer.scala    From embulk-output-s3_parquet   with MIT License
package org.embulk.output.s3_parquet.parquet

import org.apache.parquet.io.api.{Binary, RecordConsumer}

case class MockParquetRecordConsumer() extends RecordConsumer {
  case class Data private (messages: Seq[Message] = Seq()) {
    def toData: Seq[Seq[Any]] = messages.map(_.toData)
  }
  case class Message private (fields: Seq[Field] = Seq()) {
    def toData: Seq[Any] = {
      val maxIndex: Int = fields.maxBy(_.index).index
      val raw: Map[Int, Any] = fields.map(f => f.index -> f.value).toMap
      0.to(maxIndex).map(idx => raw.get(idx).orNull)
    }
  }
  case class Field private (index: Int = 0, value: Any = null)

  private var _data: Data = Data()
  private var _message: Message = Message()
  private var _field: Field = Field()

  override def startMessage(): Unit = _message = Message()
  override def endMessage(): Unit =
    _data = _data.copy(messages = _data.messages :+ _message)
  override def startField(field: String, index: Int): Unit =
    _field = Field(index = index)
  override def endField(field: String, index: Int): Unit =
    _message = _message.copy(fields = _message.fields :+ _field)
  override def startGroup(): Unit = throw new UnsupportedOperationException
  override def endGroup(): Unit = throw new UnsupportedOperationException
  override def addInteger(value: Int): Unit =
    _field = _field.copy(value = value)
  override def addLong(value: Long): Unit = _field = _field.copy(value = value)
  override def addBoolean(value: Boolean): Unit =
    _field = _field.copy(value = value)
  override def addBinary(value: Binary): Unit =
    _field = _field.copy(value = value)
  override def addFloat(value: Float): Unit =
    _field = _field.copy(value = value)
  override def addDouble(value: Double): Unit =
    _field = _field.copy(value = value)

  def writingMessage(f: => Unit): Unit = {
    startMessage()
    f
    endMessage()
  }
  def writingField(field: String, index: Int)(f: => Unit): Unit = {
    startField(field, index)
    f
    endField(field, index)
  }
  def writingSampleField(f: => Unit): Unit = {
    writingMessage {
      writingField("a", 0)(f)
    }
  }
  def data: Seq[Seq[Any]] = _data.toData
} 
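A short sketch of how the mock collects written values (names and values are illustrative):

import org.apache.parquet.io.api.Binary

val mock = MockParquetRecordConsumer()
mock.writingMessage {
  mock.writingField("id", 0)(mock.addLong(42L))
  mock.writingField("name", 1)(mock.addBinary(Binary.fromString("alice")))
}
// data == Seq(Seq(42L, Binary.fromString("alice")))
println(mock.data)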
Example 4
Source File: CustomType.scala    From parquet4s   with MIT License
package com.github.mjakubowski84.parquet4s

import com.github.mjakubowski84.parquet4s.ParquetSchemaResolver.{TypedSchemaDef, typedSchemaDef}
import org.apache.parquet.io.api.Binary
import org.apache.parquet.schema.{OriginalType, PrimitiveType}

import scala.util.Random

object CustomType {

  object Dict {

    sealed trait Type
    case object A extends Type
    case object B extends Type
    case object C extends Type
    case object D extends Type

    val values: List[Type] = List(A, B, C, D)
    def valueOf(name: String): Type = values.find(_.toString == name)
      .getOrElse(throw new IllegalArgumentException(s"Invalid dict name: $name"))

    def random: Type = values(Random.nextInt(values.length))

    // required for reading and writing
    implicit val codec: OptionalValueCodec[Type] = new OptionalValueCodec[Type] {
      override protected def decodeNonNull(value: Value, configuration: ValueCodecConfiguration): Type = value match {
        case BinaryValue(binary) => valueOf(binary.toStringUsingUTF8)
      }
      override protected def encodeNonNull(data: Type, configuration: ValueCodecConfiguration): Value =
        BinaryValue(Binary.fromString(data.toString))
    }

    // required for writing only
    implicit val schema: TypedSchemaDef[Type] =
      typedSchemaDef[Type](
        PrimitiveSchemaDef(
          primitiveType = PrimitiveType.PrimitiveTypeName.BINARY,
          required = false,
          originalType = Some(OriginalType.UTF8)
        )
      )
  }

} 
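A minimal sketch of the round trip the codec performs, using only names defined above (parquet4s invokes the codec itself during reads and writes; this just mirrors encodeNonNull and decodeNonNull by hand):

import com.github.mjakubowski84.parquet4s.CustomType.Dict
import org.apache.parquet.io.api.Binary

val original: Dict.Type = Dict.random
val binary: Binary = Binary.fromString(original.toString)       // what encodeNonNull produces
val decoded: Dict.Type = Dict.valueOf(binary.toStringUsingUTF8) // what decodeNonNull recovers
assert(decoded == original)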
Example 5
Source File: Decimals.scala    From parquet4s   with MIT License
package com.github.mjakubowski84.parquet4s

import java.math.MathContext
import java.nio.ByteBuffer

import org.apache.parquet.io.api.Binary

object Decimals {
  val Scale = 18
  val Precision = 38
  val ByteArrayLength = 16
  val MathContext = new MathContext(Precision)

  private def rescale(original: BigDecimal): BigDecimal = {
    if (original.scale == Scale && original.mc == MathContext) original
    else BigDecimal.decimal(original.bigDecimal, MathContext).setScale(Scale, BigDecimal.RoundingMode.HALF_UP)
  }

  def rescaleBinary(binary: Binary, originalScale: Int, originalMathContext: MathContext): Binary =
    binaryFromDecimal(decimalFromBinary(binary, originalScale, originalMathContext))

  def decimalFromBinary(binary: Binary, scale: Int = Scale, mathContext: MathContext = MathContext): BigDecimal =
    BigDecimal(BigInt(binary.getBytes), scale, mathContext)

  def binaryFromDecimal(decimal: BigDecimal): Binary = {
    val buf = ByteBuffer.allocate(ByteArrayLength)
    val unscaled = rescale(decimal).bigDecimal.unscaledValue().toByteArray
    // BigInteger is stored in tail of byte array, sign is stored in unoccupied cells
    val sign: Byte = if (unscaled.head < 0) -1 else 0
    (0 until ByteArrayLength - unscaled.length).foreach(_ => buf.put(sign))
    buf.put(unscaled)
    Binary.fromReusedByteArray(buf.array())
  }
} 
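A minimal round-trip sketch using only the helpers defined above; the 16-byte length and scale 18 match the object's constants:

val original = BigDecimal("1234.5678")
val binary   = Decimals.binaryFromDecimal(original)   // 16 bytes, big-endian, sign-extended
val restored = Decimals.decimalFromBinary(binary)     // scale is now 18
assert(restored.compare(original) == 0)               // numerically equal to the input
assert(binary.getBytes.length == Decimals.ByteArrayLength)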
Example 6
Source File: TimestampPrimitiveConverter.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import jodd.datetime.{JDateTime, JulianDateStamp}
import org.apache.kafka.connect.data.Field
import org.apache.parquet.example.data.simple.NanoTime
import org.apache.parquet.io.api.{Binary, PrimitiveConverter}

// see https://issues.apache.org/jira/browse/HIVE-6394 and
// https://issues.apache.org/jira/browse/SPARK-10177 for compile ideas
class TimestampPrimitiveConverter(field: Field, builder: scala.collection.mutable.Map[String, Any]) extends PrimitiveConverter {

  private val nanosInDay = BigDecimal(60 * 60 * 24) * 1000 * 1000 * 1000
  private val offset = nanosInDay / 2

  override def addBinary(x: Binary): Unit = {
    val nano = NanoTime.fromBinary(x)
    val jdt = new JDateTime()
    val f = (BigDecimal(nano.getTimeOfDayNanos) - offset) / nanosInDay
    jdt.setJulianDate(new JulianDateStamp(nano.getJulianDay, f.doubleValue()))
    builder.put(field.name, jdt.convertToSqlTimestamp)
  }
} 
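A minimal sketch of feeding the converter a single INT96 value by hand (the Field arguments and the Julian-day value are illustrative; real values arrive from the Parquet reader, and the converter only reads field.name):

import scala.collection.mutable
import org.apache.kafka.connect.data.{Field, Schema}
import org.apache.parquet.example.data.simple.NanoTime

val builder   = mutable.Map.empty[String, Any]
val field     = new Field("ts", 0, Schema.STRING_SCHEMA)  // schema is not inspected here
val converter = new TimestampPrimitiveConverter(field, builder)

// Julian day 2440588 with zero nanoseconds of day corresponds roughly to the Unix epoch.
converter.addBinary(new NanoTime(2440588, 0L).toBinary)
println(builder("ts"))  // a java.sql.Timestamp near 1970-01-01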
Example 7
Source File: StructWriteSupport.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)
  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written
  private var consumer: RecordConsumer = _

  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])
  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)
  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  private def writeStructFields(struct: Struct): Unit = {
    for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        val struct = value.asInstanceOf[Struct]
        writeGroup {
          schema.fields.asScala
            .map { field => field -> struct.get(field) }
            .zipWithIndex.foreach { case ((field, v), k) =>
            writeField(field.name, k) {
              valueWriter(field.schema)(v)
            }
          }
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    // consumer.startMessage()
    f
    //consumer.endMessage()
    consumer.endGroup()
  }

  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
} 
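A minimal sketch of driving the write support by hand (normally ParquetWriter calls init, prepareForWrite and write; the RecordConsumer is left as a placeholder to be replaced by the writer-supplied consumer or a test double):

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
import org.apache.parquet.io.api.RecordConsumer

val schema: Schema = SchemaBuilder.struct().name("person")
  .field("name", Schema.STRING_SCHEMA)
  .field("age", Schema.INT32_SCHEMA)
  .build()

val support = new StructWriteSupport(schema)
support.init(new Configuration())

val consumer: RecordConsumer = ???  // placeholder: supplied by the Parquet writer in real use
support.prepareForWrite(consumer)
support.write(new Struct(schema).put("name", "alice").put("age", 30))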