org.apache.parquet.hadoop.ParquetWriter Scala Examples
The following examples show how to use org.apache.parquet.hadoop.ParquetWriter.
Each snippet is taken from an open-source project; the project, source file, and license are noted in the heading above it.
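Before the project-specific examples, a minimal sketch of the basic pattern may help: obtain a concrete ParquetWriter through a builder, write records, and close the writer so that row groups and the file footer are flushed. The sketch below uses AvroParquetWriter (the same concrete writer as Example 3); the schema, record, and output path are illustrative assumptions, not taken from any of the projects below.

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.ParquetWriter

// Illustrative schema and output path.
val schema = new Schema.Parser().parse(
  """{"type": "record", "name": "User", "fields": [{"name": "name", "type": "string"}]}""")

val writer: ParquetWriter[GenericRecord] =
  AvroParquetWriter.builder[GenericRecord](new Path("/tmp/users.parquet"))
    .withSchema(schema)
    .build()

val record = new GenericData.Record(schema)
record.put("name", "alice")

writer.write(record)
writer.close() // closing flushes row groups and writes the Parquet footer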
Example 1
Source File: S3ParquetPageOutput.scala From embulk-output-s3_parquet with MIT License
package org.embulk.output.s3_parquet

import java.io.File
import java.nio.file.{Files, Paths}

import com.amazonaws.services.s3.transfer.{TransferManager, Upload}
import com.amazonaws.services.s3.transfer.model.UploadResult
import org.apache.parquet.hadoop.ParquetWriter
import org.embulk.config.TaskReport
import org.embulk.output.s3_parquet.aws.Aws
import org.embulk.spi.{Exec, Page, PageReader, TransactionalPageOutput}

case class S3ParquetPageOutput(
    outputLocalFile: String,
    reader: PageReader,
    writer: ParquetWriter[PageReader],
    aws: Aws,
    destBucket: String,
    destKey: String
) extends TransactionalPageOutput {

  private var isClosed: Boolean = false

  override def add(page: Page): Unit = {
    reader.setPage(page)
    while (reader.nextRecord()) {
      ContextClassLoaderSwapper.usingPluginClass {
        writer.write(reader)
      }
    }
  }

  override def finish(): Unit = {}

  override def close(): Unit = {
    synchronized {
      if (!isClosed) {
        ContextClassLoaderSwapper.usingPluginClass {
          writer.close()
        }
        isClosed = true
      }
    }
  }

  override def abort(): Unit = {
    close()
    cleanup()
  }

  override def commit(): TaskReport = {
    close()
    val result: UploadResult = ContextClassLoaderSwapper.usingPluginClass {
      aws.withTransferManager { xfer: TransferManager =>
        val upload: Upload = xfer.upload(destBucket, destKey, new File(outputLocalFile))
        upload.waitForUploadResult()
      }
    }
    cleanup()
    Exec
      .newTaskReport()
      .set("bucket", result.getBucketName)
      .set("key", result.getKey)
      .set("etag", result.getETag)
      .set("version_id", result.getVersionId)
  }

  private def cleanup(): Unit = {
    Files.delete(Paths.get(outputLocalFile))
  }
}
Example 2
Source File: ParquetWriterConfig.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import com.sksamuel.exts.config.ConfigSupport
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName

case class ParquetWriterConfig(blockSize: Int,
                               pageSize: Int,
                               compressionCodec: CompressionCodecName,
                               enableDictionary: Boolean,
                               validating: Boolean)

object ParquetWriterConfig extends Logging with ConfigSupport {

  def apply(): ParquetWriterConfig = apply(ConfigFactory.load())

  def apply(config: Config): ParquetWriterConfig = {

    val blockSize: Int = config.getIntOrElse("eel.parquet.blockSize", ParquetWriter.DEFAULT_BLOCK_SIZE)
    val pageSize: Int = config.getIntOrElse("eel.parquet.pageSize", ParquetWriter.DEFAULT_PAGE_SIZE)

    val compressionCodec = config.getString("eel.parquet.compressionCodec").toLowerCase() match {
      case "gzip" => CompressionCodecName.GZIP
      case "lzo" => CompressionCodecName.LZO
      case "snappy" => CompressionCodecName.SNAPPY
      case _ => CompressionCodecName.UNCOMPRESSED
    }

    logger.debug(s"Parquet writer will use blockSize = $blockSize; pageSize = $pageSize; compressionCodec = $compressionCodec")
    ParquetWriterConfig(blockSize, pageSize, compressionCodec, true, true)
  }
}
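ParquetWriterConfig reads its settings from Typesafe config. A rough sketch of overriding those settings programmatically, assuming the same eel.parquet.* key names used in the code above (the values shown are only illustrative, not project defaults):

import com.typesafe.config.ConfigFactory

// Hypothetical override of the eel.parquet.* keys read by ParquetWriterConfig.
val config = ConfigFactory.parseString(
  """
    |eel.parquet.blockSize = 134217728
    |eel.parquet.pageSize = 1048576
    |eel.parquet.compressionCodec = snappy
  """.stripMargin
).withFallback(ConfigFactory.load())

val writerConfig = ParquetWriterConfig(config)
// writerConfig.compressionCodec is now CompressionCodecName.SNAPPY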
Example 3
Source File: AvroParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet.avro

import com.sksamuel.exts.Logging
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}

object AvroParquetWriterFn extends Logging {

  def apply(path: Path, avroSchema: Schema): ParquetWriter[GenericRecord] = {
    val config = ParquetWriterConfig()
    AvroParquetWriter.builder[GenericRecord](path)
      .withSchema(avroSchema)
      .withCompressionCodec(config.compressionCodec)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withValidation(config.validating)
      .build()
  }
}
Example 4
Source File: RowParquetWriterFn.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode

object RowParquetWriterFn {

  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
}
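A hypothetical call site for RowParquetWriterFn, assuming an io.eels.schema.StructType named schema and a sequence of io.eels.Row values named rows built elsewhere; the path, metadata, and rounding mode are illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import scala.math.BigDecimal.RoundingMode

// Illustrative only: schema and rows are assumed to exist.
val writer = RowParquetWriterFn(
  path = new Path("/tmp/rows.parquet"),
  schema = schema,
  metadata = Map("created.by" -> "eel"),
  dictionary = true,
  roundingMode = RoundingMode.HALF_UP,
  fsConfig = new Configuration()
)
rows.foreach(writer.write)
writer.close()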
Example 5
Source File: DirectParquetWriter.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters._

import org.apache.hadoop.conf
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

object DirectParquetWriter {

  // A record is written by a function that drives the RecordConsumer directly.
  type RecordBuilder = RecordConsumer => Unit

  private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String])
    extends WriteSupport[RecordBuilder] {

    private var recordConsumer: RecordConsumer = _

    // Initialize
    override def init(configuration: conf.Configuration): WriteContext = {
      new WriteContext(schema, metadata.asJava)
    }

    // Write one record
    override def write(buildRecord: RecordBuilder): Unit = {
      recordConsumer.startMessage()
      buildRecord(recordConsumer)
      recordConsumer.endMessage()
    }

    // Prepare for writing
    override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
      this.recordConsumer = recordConsumer
    }
  }

  // Write directly to a Parquet file
  def writeDirect
      (path: String, schema: String, metadata: Map[String, String] = Map.empty)
      (f: ParquetWriter[RecordBuilder] => Unit): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordBuilder](new Path(path), writeSupport)
    try f(parquetWriter) finally parquetWriter.close()
  }

  // Write one message
  def message(writer: ParquetWriter[RecordBuilder])(builder: RecordBuilder): Unit = {
    writer.write(builder)
  }

  // Group
  def group(consumer: RecordConsumer)(f: => Unit): Unit = {
    consumer.startGroup()
    f
    consumer.endGroup()
  }

  // Field
  def field(consumer: RecordConsumer, name: String, index: Int = 0)(f: => Unit): Unit = {
    consumer.startField(name, index)
    f
    consumer.endField(name, index)
  }
}
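A minimal sketch of how writeDirect, message, and field compose, assuming the helpers above are in scope; the file path, schema string, and field values are illustrative:

val schema =
  """message root {
    |  required int32 id;
    |  required binary name (UTF8);
    |}""".stripMargin

// Illustrative path and values; the field index must match the field's position in the schema.
writeDirect("/tmp/direct.parquet", schema) { writer =>
  message(writer) { consumer =>
    field(consumer, "id") {
      consumer.addInteger(1)
    }
    field(consumer, "name", index = 1) {
      consumer.addBinary(org.apache.parquet.io.api.Binary.fromString("alice"))
    }
  }
}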
Example 6
Source File: ParquetCompatibilityTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest

object ParquetCompatibilityTest {

  // Excerpt: only writeDirect is shown here. DirectWriteSupport, a WriteSupport whose
  // records are RecordConsumer => Unit functions, is defined elsewhere in the same file
  // (Example 5 shows an equivalent implementation).
  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
}
Example 7
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] =
      com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTime: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] =
      Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
        val record = Record(struct, path, offset)
        offset = offset + 1
        record
      }

    override def close(): Unit = reader.close()
  }
}
Example 8
Source File: package.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetReader, ParquetWriter}

package object parquet {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  def listFiles(path: Path)(implicit fs: FileSystem): List[Path] = {
    if (fs.isDirectory(path)) {
      logger.debug(s"$path is a directory, reading constituent files")
      val remote = fs.listFiles(path, false)
      new Iterator[Path] {
        override def hasNext: Boolean = remote.hasNext
        override def next(): Path = remote.next().getPath
      }.toList
    } else {
      logger.debug(s"Reading $path as a single file")
      List(path)
    }
  }

  def parquetReader(file: Path)(implicit fs: FileSystem): ParquetReader[Struct] = {
    ParquetReader.builder(new StructReadSupport, file)
      .withConf(fs.getConf)
      .build()
  }

  def parquetWriter(path: Path,
                    schema: Schema,
                    config: ParquetSinkConfig): ParquetWriter[Struct] = {
    new StructParquetWriterBuilder(path, schema)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(config.enableDictionary)
      .withValidation(config.validation)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withWriteMode(if (config.overwrite) {
        ParquetFileWriter.Mode.OVERWRITE
      } else {
        ParquetFileWriter.Mode.CREATE
      })
      .build()
  }
}
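A rough round-trip sketch using the parquetWriter and parquetReader helpers above. It assumes the package object (with the StructParquetWriterBuilder and StructReadSupport it relies on) is imported, and that ParquetSinkConfig can be constructed as in Example 7; the Connect schema, record, and path are illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}

implicit val fs: FileSystem = FileSystem.getLocal(new Configuration())

// Illustrative Kafka Connect schema and record.
val schema: Schema = SchemaBuilder.struct()
  .field("name", Schema.STRING_SCHEMA)
  .field("age", Schema.INT32_SCHEMA)
  .build()
val struct = new Struct(schema).put("name", "alice").put("age", 30)

val path = new Path("/tmp/people.parquet")
val writer = parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
writer.write(struct)
writer.close()

// Read back until the reader returns null (end of file).
val reader = parquetReader(path)
val people = Iterator.continually(reader.read).takeWhile(_ != null).toList
reader.close()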
Example 9
Source File: ParquetHiveFormat.scala From stream-reactor with Apache License 2.0
package com.landoop.streamreactor.connect.hive.formats

import com.landoop.streamreactor.connect.hive.Serde
import com.landoop.streamreactor.connect.hive.parquet.ParquetSinkConfig
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter

import scala.util.Try

object ParquetHiveFormat extends HiveFormat {
  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)

  override def serde = Serde(
    "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
    "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
    Map("serialization.format" -> "1")
  )

  override def writer(path: Path, schema: Schema)
                     (implicit fs: FileSystem): HiveWriter = new HiveWriter {

    logger.debug(s"Creating parquet writer at $path")

    val writer: ParquetWriter[Struct] =
      com.landoop.streamreactor.connect.hive.parquet.parquetWriter(path, schema, ParquetSinkConfig(overwrite = true))
    Try(fs.setPermission(path, FsPermission.valueOf("-rwxrwxrwx")))

    val createdTimestamp: Long = System.currentTimeMillis()
    var lastKnownFileSize: Long = fs.getFileStatus(path).getLen
    var readFileSize = false
    var count = 0

    override def write(struct: Struct): Long = {
      writer.write(struct)
      count = count + 1
      readFileSize = true
      count
    }

    override def close(): Unit = {
      logger.debug(s"Closing writer at path $path")
      writer.close()
    }

    override def currentCount: Long = count
    override def file: Path = path
    override def createdTime: Long = createdTimestamp

    override def fileSize: Long = {
      if (readFileSize) {
        lastKnownFileSize = fs.getFileStatus(path).getLen
        readFileSize = false
      }
      lastKnownFileSize
    }
  }

  override def reader(path: Path, startAt: Int, schema: Schema)
                     (implicit fs: FileSystem): HiveReader = new HiveReader {

    logger.debug(s"Creating parquet reader for $path with offset $startAt")

    val reader = com.landoop.streamreactor.connect.hive.parquet.parquetReader(path)
    var offset = startAt

    override def iterator: Iterator[Record] =
      Iterator.continually(reader.read).takeWhile(_ != null).drop(startAt).map { struct =>
        val record = Record(struct, path, offset)
        offset = offset + 1
        record
      }

    override def close(): Unit = reader.close()
  }
}