org.apache.parquet.hadoop.metadata.CompressionCodecName Scala Examples
The following examples show how to use org.apache.parquet.hadoop.metadata.CompressionCodecName in Scala. Each example is taken from an open-source project; the source file, project, and license are listed above the code.
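Before the project examples, here is a minimal, self-contained sketch of the enum itself (the object name CodecDemo and the chosen codec are illustrative only, and parquet-hadoop is assumed to be on the classpath). It resolves a codec from its short name and inspects the file extension and Hadoop codec class that several of the examples below rely on:

import org.apache.parquet.hadoop.metadata.CompressionCodecName

object CodecDemo extends App {
  // Resolve a codec from a case-insensitive short name; a null name falls back to UNCOMPRESSED.
  val codec: CompressionCodecName = CompressionCodecName.fromConf("snappy")

  // File extension appended to Parquet part files for this codec, e.g. ".snappy".
  println(codec.getExtension)

  // Fully qualified Hadoop CompressionCodec class name backing this codec (null if there is none).
  println(codec.getHadoopCompressionCodecClassName)
}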
Example 1
Source File: ParquetOptions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.internal.SQLConf

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}

object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)
}
Example 2
Source File: ParquetAvroDataSourceSpec.scala From utils with Apache License 2.0
package com.indix.utils.spark.parquet

import java.io.File

import com.google.common.io.Files
import com.indix.utils.spark.parquet.avro.ParquetAvroDataSource
import org.apache.commons.io.FileUtils
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.apache.spark.sql.SparkSession
import org.scalactic.Equality
import org.scalatest.Matchers.{be, convertToAnyShouldWrapper, equal}
import org.scalatest.{BeforeAndAfterAll, FlatSpec}

import java.util.{Arrays => JArrays}

case class SampleAvroRecord(a: Int, b: String, c: Seq[String], d: Boolean, e: Double,
                            f: collection.Map[String, String], g: Array[Byte])

class ParquetAvroDataSourceSpec extends FlatSpec with BeforeAndAfterAll with ParquetAvroDataSource {
  private var spark: SparkSession = _

  implicit val sampleAvroRecordEq = new Equality[SampleAvroRecord] {
    override def areEqual(left: SampleAvroRecord, b: Any): Boolean = b match {
      case right: SampleAvroRecord =>
        left.a == right.a &&
          left.b == right.b &&
          Equality.default[Seq[String]].areEqual(left.c, right.c) &&
          left.d == right.d &&
          left.e == right.e &&
          Equality.default[collection.Map[String, String]].areEqual(left.f, right.f) &&
          JArrays.equals(left.g, right.g)
      case _ => false
    }
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    spark = SparkSession.builder().master("local[2]").appName("ParquetAvroDataSource").getOrCreate()
  }

  override protected def afterAll(): Unit = {
    try {
      spark.sparkContext.stop()
    } finally {
      super.afterAll()
    }
  }

  "AvroBasedParquetDataSource" should "read/write avro records as ParquetData" in {
    val outputLocation = Files.createTempDir().getAbsolutePath + "/output"
    val sampleRecords: Seq[SampleAvroRecord] = Seq(
      SampleAvroRecord(1, "1", List("a1"), true, 1.0d, Map("a1" -> "b1"), "1".getBytes),
      SampleAvroRecord(2, "2", List("a2"), false, 2.0d, Map("a2" -> "b2"), "2".getBytes),
      SampleAvroRecord(3, "3", List("a3"), true, 3.0d, Map("a3" -> "b3"), "3".getBytes),
      SampleAvroRecord(4, "4", List("a4"), true, 4.0d, Map("a4" -> "b4"), "4".getBytes),
      SampleAvroRecord(5, "5", List("a5"), false, 5.0d, Map("a5" -> "b5"), "5".getBytes)
    )
    val sampleDf = spark.createDataFrame(sampleRecords)
    sampleDf.rdd.saveAvroInParquet(outputLocation, sampleDf.schema, CompressionCodecName.GZIP)

    val sparkVal = spark
    import sparkVal.implicits._
    val records: Array[SampleAvroRecord] = spark.read.parquet(outputLocation).as[SampleAvroRecord].collect()

    records.length should be(5)
    // We use === to use the custom Equality defined above for comparing Array[Byte]
    // Ref - https://github.com/scalatest/scalatest/issues/491
    records.sortBy(_.a) === sampleRecords.sortBy(_.a)

    FileUtils.deleteDirectory(new File(outputLocation))
  }
}
Example 3
Source File: MessageSink.scala From parquet4s with MIT License
package com.github.mjakubowski84.parquet4s.indefinite

import java.sql.Timestamp
import java.util.UUID

import akka.Done
import akka.kafka.CommitterSettings
import akka.kafka.ConsumerMessage.CommittableOffsetBatch
import akka.kafka.scaladsl.Committer
import akka.stream.scaladsl.{Flow, Keep, Sink}
import com.github.mjakubowski84.parquet4s.{ChunkPathBuilder, ParquetStreams, ParquetWriter}
import com.google.common.io.Files
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import scala.concurrent.Future
import scala.concurrent.duration._

object MessageSink {

  case class Data(timestamp: Timestamp, word: String)

  val MaxChunkSize: Int = 128
  val ChunkWriteTimeWindow: FiniteDuration = 10.seconds
  val WriteDirectoryName: String = "messages"
}

trait MessageSink {
  this: Akka =>

  import MessageSink._
  import MessageSource._

  protected val baseWritePath: String =
    new Path(Files.createTempDir().getAbsolutePath, WriteDirectoryName).toString

  private val writerOptions = ParquetWriter.Options(compressionCodecName = CompressionCodecName.SNAPPY)

  private lazy val committerSink = Flow.apply[Seq[Message]].map { messages =>
    CommittableOffsetBatch(messages.map(_.committableOffset))
  }.toMat(Committer.sink(CommitterSettings(system)))(Keep.right)

  def chunkPath: ChunkPathBuilder[Message] = {
    case (basePath, chunk) =>
      val lastElementDateTime = new Timestamp(chunk.last.record.timestamp()).toLocalDateTime
      val year = lastElementDateTime.getYear
      val month = lastElementDateTime.getMonthValue
      val day = lastElementDateTime.getDayOfMonth
      val uuid = UUID.randomUUID()

      basePath.suffix(s"/$year/$month/$day/part-$uuid.parquet")
  }

  lazy val messageSink: Sink[Message, Future[Done]] = ParquetStreams.toParquetIndefinite(
    path = baseWritePath,
    maxChunkSize = MaxChunkSize,
    chunkWriteTimeWindow = ChunkWriteTimeWindow,
    buildChunkPath = chunkPath,
    preWriteTransformation = { message: Message =>
      Data(
        timestamp = new Timestamp(message.record.timestamp()),
        word = message.record.value()
      )
    },
    postWriteSink = committerSink,
    options = writerOptions
  )
}
Example 4
Source File: ParquetWriterConfig.scala From eel-sdk with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import com.sksamuel.exts.config.ConfigSupport
import com.typesafe.config.{Config, ConfigFactory}
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.metadata.CompressionCodecName

case class ParquetWriterConfig(blockSize: Int,
                               pageSize: Int,
                               compressionCodec: CompressionCodecName,
                               enableDictionary: Boolean,
                               validating: Boolean)

object ParquetWriterConfig extends Logging with ConfigSupport {

  def apply(): ParquetWriterConfig = apply(ConfigFactory.load())

  def apply(config: Config): ParquetWriterConfig = {
    val blockSize: Int = config.getIntOrElse("eel.parquet.blockSize", ParquetWriter.DEFAULT_BLOCK_SIZE)
    val pageSize: Int = config.getIntOrElse("eel.parquet.pageSize", ParquetWriter.DEFAULT_PAGE_SIZE)

    val compressionCodec = config.getString("eel.parquet.compressionCodec").toLowerCase() match {
      case "gzip" => CompressionCodecName.GZIP
      case "lzo" => CompressionCodecName.LZO
      case "snappy" => CompressionCodecName.SNAPPY
      case _ => CompressionCodecName.UNCOMPRESSED
    }

    logger.debug(s"Parquet writer will use blockSize = $blockSize; pageSize = $pageSize; compressionCodec = $compressionCodec")
    ParquetWriterConfig(blockSize, pageSize, compressionCodec, true, true)
  }
}
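One way such a config could be applied, sketched against Parquet's stock builder API rather than eel-sdk's own writer plumbing (the helper newWriter, the Avro schema, and the output path are hypothetical), with each ParquetWriterConfig field mapped onto a ParquetWriter.Builder setter:

import org.apache.avro.Schema
import org.apache.avro.generic.GenericRecord
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter

// Hypothetical wiring, not part of eel-sdk: shows where each config field
// would land when building an Avro-backed Parquet writer directly.
def newWriter(avroSchema: Schema, out: Path, config: ParquetWriterConfig) =
  AvroParquetWriter
    .builder[GenericRecord](out)
    .withSchema(avroSchema)
    .withCompressionCodec(config.compressionCodec)
    .withRowGroupSize(config.blockSize)
    .withPageSize(config.pageSize)
    .withDictionaryEncoding(config.enableDictionary)
    .withValidation(config.validating)
    .build()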
Example 5
Source File: ParquetOptions.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import java.util.Locale

import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}

object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO,
    "lz4" -> CompressionCodecName.LZ4,
    "brotli" -> CompressionCodecName.BROTLI,
    "zstd" -> CompressionCodecName.ZSTD)

  def getParquetCompressionCodecName(name: String): String = {
    shortParquetCompressionCodecNames(name).name()
  }
}
Example 6
Source File: CodecFactory.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec}
import org.apache.hadoop.util.ReflectionUtils
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}
import org.apache.parquet.hadoop.metadata.CompressionCodecName

// This is a simple version of parquet's CodeFactory.
// TODO: [linhong] Need change this into Scala Code style
private[oap] class CodecFactory(conf: Configuration) {

  private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor]
  private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor]
  private val codecByName = new mutable.HashMap[String, CompressionCodec]

  private def getCodec(codecString: String): Option[CompressionCodec] = {
    codecByName.get(codecString) match {
      case Some(codec) => Some(codec)
      case None =>
        val codecName = CompressionCodecName.valueOf(codecString)
        val codecClass = codecName.getHadoopCompressionCodecClass
        if (codecClass == null) {
          None
        } else {
          val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]
          codecByName.put(codecString, codec)
          Some(codec)
        }
    }
  }

  def getCompressor(codec: ParquetCodec): BytesCompressor = {
    compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name)))
  }

  def getDecompressor(codec: ParquetCodec): BytesDecompressor = {
    decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name)))
  }

  def release(): Unit = {
    compressors.values.foreach(_.release())
    compressors.clear()
    decompressors.values.foreach(_.release())
    decompressors.clear()
  }
}

private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val compressedOutBuffer = new ByteArrayOutputStream()
  private lazy val compressor = compressionCodec match {
    case Some(codec) => CodecPool.getCompressor(codec)
    case None => null
  }

  def compress(bytes: Array[Byte]): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        compressedOutBuffer.reset()
        // null compressor for non-native gzip
        if (compressor != null) {
          compressor.reset()
        }
        val cos = codec.createOutputStream(compressedOutBuffer, compressor)
        cos.write(bytes)
        cos.finish()
        cos.close()
        compressedOutBuffer.toByteArray
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnCompressor(compressor)
}

private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val decompressor = compressionCodec match {
    case Some(codec) => CodecPool.getDecompressor(codec)
    case None => null
  }

  def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        decompressor.reset()
        val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor)
        val decompressed = new Array[Byte](uncompressedSize)
        new DataInputStream(cis).readFully(decompressed)
        decompressed
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnDecompressor(decompressor)
}
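A possible round trip through the factory above, purely illustrative and not part of the OAP sources: it has to live in the same oap package because CodecFactory is private[oap], and GZIP is picked because Hadoop's GzipCodec works without native libraries.

package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}

object CodecFactoryDemo extends App {
  // Compress and decompress a small payload with the factory's cached codec instances.
  val factory = new CodecFactory(new Configuration())
  val input = "hello parquet".getBytes("UTF-8")

  val compressed = factory.getCompressor(ParquetCodec.GZIP).compress(input)
  val restored = factory.getDecompressor(ParquetCodec.GZIP).decompress(compressed, input.length)

  assert(new String(restored, "UTF-8") == "hello parquet")
  factory.release()
}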
Example 7
Source File: ParquetOptions.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}

object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)
}
Example 8
Source File: DataWriterFactory.scala From CM-Well with Apache License 2.0
package cmwell.analytics.data

import java.io.File
import java.nio.file.Paths

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import cmwell.analytics.util.Shard
import org.apache.avro.generic.GenericRecord
import org.apache.commons.io.FileUtils
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import scala.concurrent.ExecutionContextExecutor

trait DataWriterFactory[T <: GenericRecord] {
  def apply(shard: Shard): DataWriter[T]
}

object DataWriterFactory {

  private val compressionCodec = CompressionCodecName.SNAPPY

  def file[T <: GenericRecord with CsvGenerator](format: String,
                                                 objectExtractor: ObjectExtractor[T],
                                                 outDirectory: String): Shard => DataWriter[T] = {

    val extension = s".$format" + (if (format == "parquet") s"${compressionCodec.getExtension}" else "")

    // Generate a meaningful file name for the target file name based on the source shard index name and shard number.
    (sourceShard: Shard) => {
      val outFile: File = Paths.get(outDirectory, s"part-r-${sourceShard.indexName}.${sourceShard.shard}$extension").toFile

      if (outFile.exists)
        FileUtils.forceDelete(outFile)

      new File(outFile.getParent).mkdirs()

      FileDataWriter[T](format, objectExtractor.schema, outFile.toString, compressionCodec)
    }
  }

  def index[T <: GenericRecord](indexMap: Map[String, String], // source-index -> target-index
                                esEndpoint: String)
                               (implicit system: ActorSystem,
                                executionContext: ExecutionContextExecutor,
                                actorMaterializer: ActorMaterializer
                               ): Shard => DataWriter[T] = {

    (sourceShard: Shard) => {
      val targetIndex = indexMap(sourceShard.indexName)
      new IndexDataWriter[T](indexName = targetIndex, esEndpoint = esEndpoint)
    }
  }
}
Example 9
Source File: ParquetOptions.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}

object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)
}
Example 10
Source File: ParquetOptions.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import java.util.Locale

import org.apache.parquet.hadoop.ParquetOutputFormat
import org.apache.parquet.hadoop.metadata.CompressionCodecName

import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.internal.SQLConf

  val mergeSchema: Boolean = parameters
    .get(MERGE_SCHEMA)
    .map(_.toBoolean)
    .getOrElse(sqlConf.isParquetSchemaMergingEnabled)
}

object ParquetOptions {
  val MERGE_SCHEMA = "mergeSchema"

  // The parquet compression short names
  private val shortParquetCompressionCodecNames = Map(
    "none" -> CompressionCodecName.UNCOMPRESSED,
    "uncompressed" -> CompressionCodecName.UNCOMPRESSED,
    "snappy" -> CompressionCodecName.SNAPPY,
    "gzip" -> CompressionCodecName.GZIP,
    "lzo" -> CompressionCodecName.LZO)

  def getParquetCompressionCodecName(name: String): String = {
    shortParquetCompressionCodecNames(name).name()
  }
}
Example 11
Source File: package.scala From scio with Apache License 2.0
package com.spotify.scio.parquet

import com.spotify.scio.ScioContext
import com.spotify.scio.io.ClosedTap
import com.spotify.scio.parquet.tensorflow.ParquetExampleIO.WriteParam
import com.spotify.scio.values.SCollection
import me.lyh.parquet.tensorflow.Schema
import org.apache.parquet.filter2.predicate.FilterPredicate
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import org.tensorflow.example.Example

    def saveAsParquetExampleFile(
      path: String,
      schema: Schema,
      numShards: Int = WriteParam.DefaultNumShards,
      suffix: String = WriteParam.DefaultSuffix,
      compression: CompressionCodecName = WriteParam.DefaultCompression
    ): ClosedTap[Example] =
      self.write(ParquetExampleIO(path))(WriteParam(schema, numShards, suffix, compression))
  }
}
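A call site for the method above might look like the following; the examples collection, exampleSchema, and the output path are placeholders assumed to exist in the surrounding pipeline, not part of scio itself.

// Hypothetical pipeline fragment: `examples: SCollection[Example]` and
// `exampleSchema: Schema` come from earlier steps of the job.
examples.saveAsParquetExampleFile(
  path = "gs://my-bucket/parquet-examples",
  schema = exampleSchema,
  compression = CompressionCodecName.ZSTD
)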