org.apache.parquet.schema.MessageType Scala Examples
The following examples show how to use org.apache.parquet.schema.MessageType.
Each example notes the project and source file it was taken from, along with that project's license.
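Before the project examples, here is a minimal standalone sketch, not taken from any of the projects below, that builds a MessageType with Parquet's Types builder API and inspects it; the schema and field names are illustrative only.

import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT64}
import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
import org.apache.parquet.schema.{MessageType, OriginalType, Types}

object MessageTypeSketch extends App {
  // Assemble a message schema with a required int64 "id" and an optional UTF-8 encoded "name".
  val schema: MessageType = Types.buildMessage()
    .addField(Types.primitive(INT64, REQUIRED).named("id"))
    .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named("name"))
    .named("example_schema")

  // MessageType extends GroupType, so the fields can be inspected directly.
  println(schema)                 // prints the schema in Parquet's textual schema format
  println(schema.getFieldCount)   // 2
  println(schema.getType("name")) // the declaration of the "name" field
}

The same builder pattern appears in Example 1 below, while Examples 7 and 9 show the alternative route of parsing a schema string with MessageTypeParser.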
Example 1
Source File: WriteAndReadGenericApp.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s.core

import java.time.{LocalDate, ZoneOffset}
import java.util.TimeZone

import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, RowParquetRecord, ValueCodecConfiguration}
import com.google.common.io.Files
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64}
import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
import org.apache.parquet.schema.{MessageType, OriginalType, Types}

object WriteAndReadGenericApp extends App {

  val ID = "id"
  val Name = "name"
  val Birthday = "birthday"
  val Schema = "user_schema"

  val path = Files.createTempDir().getAbsolutePath
  val vcc = ValueCodecConfiguration(TimeZone.getTimeZone(ZoneOffset.UTC))

  val users = List(
    (1L, "Alice", LocalDate.of(2000, 1, 1)),
    (2L, "Bob", LocalDate.of(1980, 2, 28)),
    (3L, "Cecilia", LocalDate.of(1977, 3, 15))
  ).map { case (id, name, birthday) =>
    RowParquetRecord.empty
      .add(ID, id, vcc)
      .add(Name, name, vcc)
      .add(Birthday, birthday, vcc)
  }

  // write
  implicit val schema: MessageType = Types.buildMessage()
    .addField(Types.primitive(INT64, REQUIRED).as(OriginalType.INT_64).named(ID))
    .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named(Name))
    .addField(Types.primitive(INT32, OPTIONAL).as(OriginalType.DATE).named(Birthday))
    .named(Schema)

  ParquetWriter.writeAndClose(s"$path/users.parquet", users)

  // read
  val readData = ParquetReader.read[RowParquetRecord](path)
  try {
    readData.foreach { record =>
      val id = record.get[Long](ID, vcc)
      val name = record.get[String](Name, vcc)
      val birthday = record.get[LocalDate](Birthday, vcc)
      println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]")
    }
  } finally readData.close()
}
Example 2
Source File: UnorderedParallelParquetSink.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import java.util.UUID

import akka.Done
import akka.stream.scaladsl.{Flow, Keep, Sink}
import org.apache.hadoop.fs.Path
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.Future

private[parquet4s] object UnorderedParallelParquetSink extends IOOps {

  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path,
                                                             parallelism: Int,
                                                             options: ParquetWriter.Options = ParquetWriter.Options()
                                                            ): Sink[T, Future[Done]] = {
    val schema = ParquetSchemaResolver.resolveSchema[T]
    val valueCodecConfiguration = options.toValueCodecConfiguration

    validateWritePath(path, options)

    def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration)

    Flow[T]
      .zipWithIndex
      .groupBy(parallelism, elemAndIndex => Math.floorMod(elemAndIndex._2, parallelism))
      .map(elemAndIndex => encode(elemAndIndex._1))
      .fold(UnorderedChunk(path, schema, options))(_.write(_))
      .map(_.close())
      .async
      .mergeSubstreamsWithParallelism(parallelism)
      .toMat(Sink.ignore)(Keep.right)
  }

  private trait UnorderedChunk {
    def write(record: RowParquetRecord): UnorderedChunk
    def close(): Unit
  }

  private object UnorderedChunk {

    def apply(basePath: Path,
              schema: MessageType,
              options: ParquetWriter.Options): UnorderedChunk = new PendingUnorderedChunk(basePath, schema, options)

    private[UnorderedChunk] class PendingUnorderedChunk(basePath: Path,
                                                        schema: MessageType,
                                                        options: ParquetWriter.Options) extends UnorderedChunk {
      override def write(record: RowParquetRecord): UnorderedChunk = {
        val chunkPath = Path.mergePaths(basePath, new Path(s"/part-${UUID.randomUUID()}.parquet"))
        val writer = ParquetWriter.internalWriter(chunkPath, schema, options)
        writer.write(record)
        new StartedUnorderedChunk(chunkPath, writer, acc = 1)
      }

      override def close(): Unit = ()
    }

    private[UnorderedChunk] class StartedUnorderedChunk(chunkPath: Path,
                                                        writer: ParquetWriter.InternalWriter,
                                                        acc: Long
                                                       ) extends UnorderedChunk {
      override def write(record: RowParquetRecord): UnorderedChunk = {
        writer.write(record)
        new StartedUnorderedChunk(chunkPath, writer, acc = acc + 1)
      }

      override def close(): Unit = {
        if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath")
        writer.close()
      }
    }
  }
}
Example 3
Source File: SequentialFileSplittingParquetSink.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s

import akka.Done
import akka.stream.scaladsl.{Flow, Keep, Sink}
import org.apache.hadoop.fs.Path
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.Future

private[parquet4s] object SequentialFileSplittingParquetSink extends IOOps {

  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path,
                                                             maxRecordsPerFile: Long,
                                                             options: ParquetWriter.Options = ParquetWriter.Options()
                                                            ): Sink[T, Future[Done]] = {
    val schema = ParquetSchemaResolver.resolveSchema[T]
    val valueCodecConfiguration = options.toValueCodecConfiguration

    validateWritePath(path, options)

    def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration)

    Flow[T]
      .zipWithIndex
      .map { case (elem, index) => OrderedChunkElem(encode(elem), index) }
      .fold(OrderedChunk(path, schema, maxRecordsPerFile, options))(_.write(_))
      .map(_.close())
      .toMat(Sink.ignore)(Keep.right)
  }

  private case class OrderedChunkElem(record: RowParquetRecord, index: Long) {
    def isSplit(maxRecordsPerFile: Long): Boolean = index % maxRecordsPerFile == 0
  }

  private trait OrderedChunk {
    def write(elem: OrderedChunkElem): OrderedChunk
    def close(): Unit
  }

  private object OrderedChunk {

    def apply(basePath: Path,
              schema: MessageType,
              maxRecordsPerFile: Long,
              options: ParquetWriter.Options): OrderedChunk = new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options)

    private[OrderedChunk] class PendingOrderedChunk(basePath: Path,
                                                    schema: MessageType,
                                                    maxRecordsPerFile: Long,
                                                    options: ParquetWriter.Options) extends OrderedChunk {
      override def write(elem: OrderedChunkElem): OrderedChunk = {
        val chunkNumber: Int = Math.floorDiv(elem.index, maxRecordsPerFile).toInt
        val chunkPath = Path.mergePaths(basePath, new Path(chunkFileName(chunkNumber)))
        val writer = ParquetWriter.internalWriter(chunkPath, schema, options)
        writer.write(elem.record)
        new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = 1)
      }

      override def close(): Unit = ()

      private def chunkFileName(chunkNumber: Int): String = f"/part-$chunkNumber%05d.parquet"
    }

    private[OrderedChunk] class StartedOrderedChunk(basePath: Path,
                                                    schema: MessageType,
                                                    maxRecordsPerFile: Long,
                                                    options: ParquetWriter.Options,
                                                    chunkPath: Path,
                                                    writer: ParquetWriter.InternalWriter,
                                                    acc: Long) extends OrderedChunk {
      override def write(elem: OrderedChunkElem): OrderedChunk = {
        if (elem.isSplit(maxRecordsPerFile)) {
          this.close()
          new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options).write(elem)
        } else {
          writer.write(elem.record)
          new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = acc + 1)
        }
      }

      override def close(): Unit = {
        if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath")
        writer.close()
      }
    }
  }
}
Example 4
Source File: ParquetPublisher.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.parquet.util.ParquetIterator
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

class ParquetPublisher(path: Path,
                       predicate: Option[Predicate],
                       projection: Seq[String],
                       caseSensitive: Boolean,
                       dictionaryFiltering: Boolean)
                      (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  def readSchema: Option[MessageType] = {
    if (projection.isEmpty) None
    else {
      val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema
      val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema)

      if (caseSensitive) {
        assert(
          structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size,
          "Cannot use case sensitive = true when this would result in a clash of field names"
        )
      }

      val projectionSchema = StructType(projection.map { field =>
        structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema")
      })

      ParquetSchemaFns.toParquetMessageType(projectionSchema).some
    }
  }

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader =>
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        ParquetIterator(reader)
          .grouped(DataStream.DefaultBatchSize)
          .takeWhile(_ => running.get)
          .foreach(subscriber.next)
        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
}
Example 5
Source File: RowWriteSupport.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import io.eels.Row
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._
import scala.math.BigDecimal.RoundingMode.RoundingMode

// implementation of WriteSupport for Row's used by the native ParquetWriter
class RowWriteSupport(schema: MessageType,
                      roundingMode: RoundingMode,
                      metadata: Map[String, String]) extends WriteSupport[Row] with Logging {
  logger.trace(s"Created parquet row write support for schema message type $schema")

  private var writer: RowWriter = _

  override def finalizeWrite(): FinalizedWriteContext = new FinalizedWriteContext(metadata.asJava)

  def init(configuration: Configuration): WriteSupport.WriteContext = {
    new WriteSupport.WriteContext(schema, new java.util.HashMap())
  }

  def prepareForWrite(record: RecordConsumer): Unit = {
    writer = new RowWriter(record, roundingMode)
  }

  def write(row: Row): Unit = {
    writer.write(row)
  }
}

class RowWriter(record: RecordConsumer, roundingMode: RoundingMode) {

  def write(row: Row): Unit = {
    record.startMessage()
    val writer = new StructRecordWriter(row.schema, roundingMode, false)
    writer.write(record, row.values)
    record.endMessage()
  }
}
Example 6
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode

object RowParquetWriterFn {

  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
}
Example 7
Source File: DirectParquetWriter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters._

import org.apache.hadoop.conf
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

// Note: the enclosing object declaration and the RecordBuilder type alias appear to have been
// truncated in this excerpt; they are restored here so that the snippet is self-contained.
private[sql] object DirectParquetWriter {

  type RecordBuilder = RecordConsumer => Unit

  private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String])
    extends WriteSupport[RecordBuilder] {

    private var recordConsumer: RecordConsumer = _

    // initialize
    override def init(configuration: conf.Configuration): WriteContext = {
      new WriteContext(schema, metadata.asJava)
    }

    // write a record
    override def write(buildRecord: RecordBuilder): Unit = {
      recordConsumer.startMessage()
      buildRecord(recordConsumer)
      recordConsumer.endMessage()
    }

    // prepare for writing
    override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
      this.recordConsumer = recordConsumer
    }
  }

  // write directly
  def writeDirect
      (path: String, schema: String, metadata: Map[String, String] = Map.empty)
      (f: ParquetWriter[RecordBuilder] => Unit): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordBuilder](new Path(path), writeSupport)
    try f(parquetWriter) finally parquetWriter.close()
  }

  // message
  def message(writer: ParquetWriter[RecordBuilder])(builder: RecordBuilder): Unit = {
    writer.write(builder)
  }

  // group
  def group(consumer: RecordConsumer)(f: => Unit): Unit = {
    consumer.startGroup()
    f
    consumer.endGroup()
  }

  // field
  def field(consumer: RecordConsumer, name: String, index: Int = 0)(f: => Unit): Unit = {
    consumer.startField(name, index)
    f
    consumer.endField(name, index)
  }
}
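A hypothetical usage of the writeDirect helper above might look like the sketch below; the output path, schema text, and field values are made up for illustration, and the sketch sits in the same package so it can see the private[sql] helpers.

package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.io.api.Binary

import DirectParquetWriter._

object DirectWriterUsageSketch extends App {
  // Write a single record through the RecordConsumer-based helpers defined above.
  writeDirect(
    "/tmp/direct-example.parquet",
    """message example {
      |  required int32 id;
      |  optional binary name (UTF8);
      |}""".stripMargin) { writer =>
    message(writer) { consumer =>
      field(consumer, "id") { consumer.addInteger(1) }  // field index 0 (default)
      field(consumer, "name", index = 1) { consumer.addBinary(Binary.fromString("Alice")) }
    }
  }
}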
Example 8
Source File: ParquetCompatibilityTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

import org.apache.spark.sql.QueryTest

private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest {

  protected def readParquetSchema(path: String): MessageType = {
    readParquetSchema(path, { path => !path.getName.startsWith("_") })
  }

  // read the Parquet schema
  protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = {
    val fsPath = new Path(path)
    val fs = fsPath.getFileSystem(configuration)
    val parquetFiles = fs.listStatus(fsPath, new PathFilter {
      override def accept(path: Path): Boolean = pathFilter(path)
    }).toSeq
    val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true)
    footers.head.getParquetMetadata.getFileMetaData.getSchema
  }

  protected def logParquetSchema(path: String): Unit = {
    logInfo(
      // the schema of the Parquet file written by parquet-avro
      s"""Schema of the Parquet file written by parquet-avro:
         |${readParquetSchema(path)}
       """.stripMargin)
  }
}

// Parquet compatibility test helpers
object ParquetCompatibilityTest {
  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
    if (i % 3 == 0) null.asInstanceOf[T] else f
  }
}
Example 9
Source File: ParquetCompatibilityTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest

// Note: this excerpt is truncated. The enclosing object declaration is restored below so the
// snippet parses; the abstract test class and the DirectWriteSupport helper used by writeDirect
// are not shown here (see Example 7 for a comparable DirectWriteSupport implementation).
object ParquetCompatibilityTest {

  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
}
Example 10
Source File: StructReadSupport.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType

class StructReadSupport extends ReadSupport[Struct] {

  override def prepareForRead(configuration: Configuration,
                              metaData: util.Map[String, String],
                              fileSchema: MessageType,
                              context: ReadSupport.ReadContext): RecordMaterializer[Struct] = {
    // the file schema in here comes from the footer of the parquet file
    val schema = ParquetSchemas.toKafka(fileSchema)
    new StructMaterializer(schema)
  }

  override def init(context: InitContext): ReadSupport.ReadContext = {
    new ReadSupport.ReadContext(context.getFileSchema)
  }
}
Example 11
Source File: StructWriteSupport.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written
  private var consumer: RecordConsumer = _

  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext =
    new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])

  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)

  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  private def writeStructFields(struct: Struct): Unit = {
    for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        val struct = value.asInstanceOf[Struct]
        writeGroup {
          schema.fields.asScala
            .map { field => field -> struct.get(field) }
            .zipWithIndex.foreach { case ((field, v), k) =>
              writeField(field.name, k) {
                valueWriter(field.schema)(v)
              }
            }
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    f
    consumer.endGroup()
  }

  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
}