org.apache.hadoop.io.compress.CompressionCodec Scala Examples
The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec.
Example 1
Source File: SequenceFileRDDFunctions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.{classTag, ClassTag}

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != keyWritableClass
    val convertValue = self.valueClass != valueWritableClass

    logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," +
      valueWritableClass.getSimpleName + ")")
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    }
  }
}
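A minimal usage sketch (not taken from the drizzle-spark source above): it writes a pair RDD through the public saveAsSequenceFile API with a compression codec, which is the entry point into the method shown in this example. The output path, object name, and the choice of BZip2Codec are assumptions for illustration; any CompressionCodec subclass on the classpath would do.

import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.spark.{SparkConf, SparkContext}

object SaveCompressedSequenceFile {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("seqfile-demo").setMaster("local[*]"))
    // Int and String are converted to IntWritable/Text by the implicit
    // conversions behind SequenceFileRDDFunctions.
    val pairs = sc.parallelize(Seq((1, "one"), (2, "two"), (3, "three")))
    // Passing a codec class compresses the SequenceFile with BZip2 (illustrative path).
    pairs.saveAsSequenceFile("/tmp/pairs-seq-bzip2", Some(classOf[BZip2Codec]))
    sc.stop()
  }
}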
Example 2
Source File: CodecFactory.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec}
import org.apache.hadoop.util.ReflectionUtils
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}
import org.apache.parquet.hadoop.metadata.CompressionCodecName

// This is a simple version of parquet's CodecFactory.
// TODO: [linhong] Need change this into Scala Code style
private[oap] class CodecFactory(conf: Configuration) {

  private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor]
  private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor]
  private val codecByName = new mutable.HashMap[String, CompressionCodec]

  private def getCodec(codecString: String): Option[CompressionCodec] = {
    codecByName.get(codecString) match {
      case Some(codec) => Some(codec)
      case None =>
        val codecName = CompressionCodecName.valueOf(codecString)
        val codecClass = codecName.getHadoopCompressionCodecClass
        if (codecClass == null) {
          None
        } else {
          val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]
          codecByName.put(codecString, codec)
          Some(codec)
        }
    }
  }

  def getCompressor(codec: ParquetCodec): BytesCompressor = {
    compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name)))
  }

  def getDecompressor(codec: ParquetCodec): BytesDecompressor = {
    decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name)))
  }

  def release(): Unit = {
    compressors.values.foreach(_.release())
    compressors.clear()
    decompressors.values.foreach(_.release())
    decompressors.clear()
  }
}

private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val compressedOutBuffer = new ByteArrayOutputStream()
  private lazy val compressor = compressionCodec match {
    case Some(codec) => CodecPool.getCompressor(codec)
    case None => null
  }

  def compress(bytes: Array[Byte]): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        compressedOutBuffer.reset()
        // null compressor for non-native gzip
        if (compressor != null) {
          compressor.reset()
        }
        val cos = codec.createOutputStream(compressedOutBuffer, compressor)
        cos.write(bytes)
        cos.finish()
        cos.close()
        compressedOutBuffer.toByteArray
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnCompressor(compressor)
}

private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val decompressor = compressionCodec match {
    case Some(codec) => CodecPool.getDecompressor(codec)
    case None => null
  }

  def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        decompressor.reset()
        val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor)
        val decompressed = new Array[Byte](uncompressedSize)
        new DataInputStream(cis).readFully(decompressed)
        decompressed
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnDecompressor(decompressor)
}
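The BytesCompressor/BytesDecompressor pair above ultimately delegates to Hadoop's CompressionCodec streams. Below is a minimal, self-contained sketch of that same round trip (not part of OAP), assuming GzipCodec; the object name and sample payload are illustrative only.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{CompressionCodec, GzipCodec}
import org.apache.hadoop.util.ReflectionUtils

object CodecRoundTrip {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Instantiate the codec the same way getCodec does, via ReflectionUtils.
    val codec = ReflectionUtils.newInstance(classOf[GzipCodec], conf).asInstanceOf[CompressionCodec]

    val original = "hello compression codec".getBytes("UTF-8")

    // Compress into an in-memory buffer.
    val out = new ByteArrayOutputStream()
    val cos = codec.createOutputStream(out)
    cos.write(original)
    cos.finish()
    cos.close()
    val compressed = out.toByteArray

    // Decompress, reading back exactly the original length,
    // mirroring BytesDecompressor.decompress.
    val cis = codec.createInputStream(new ByteArrayInputStream(compressed))
    val restored = new Array[Byte](original.length)
    new DataInputStream(cis).readFully(restored)

    assert(new String(restored, "UTF-8") == "hello compression codec")
  }
}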
Example 3
Source File: SequenceFileRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging

  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and
    // valueWritableClass at the compile time. To implement that, we need to add type parameters to
    // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a
    // breaking change.
    val convertKey = self.keyClass != _keyWritableClass
    val convertValue = self.valueClass != _valueWritableClass

    logInfo("Saving as sequence file of type " +
      s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})")
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    }
  }
}
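For completeness, a read-side sketch (an assumption, not part of the Spark file above): a SequenceFile written with a codec records that codec in its file header, so reading it back needs no codec argument. The path reuses the one from the write sketch under Example 1 and is illustrative only.

import org.apache.spark.{SparkConf, SparkContext}

object ReadCompressedSequenceFile {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("seqfile-read").setMaster("local[*]"))
    // The type parameters pick the implicit WritableConverters for key and value;
    // decompression happens transparently based on the SequenceFile header.
    val pairs = sc.sequenceFile[Int, String]("/tmp/pairs-seq-bzip2")
    pairs.take(3).foreach(println)
    sc.stop()
  }
}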
Example 4
Source File: IOCommon.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc: SparkContext) {

  def load[T: ClassTag: TypeTag](filename: String, force_format: Option[String] = None) = {
    val input_format = force_format.getOrElse(
      IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))

    input_format match {
      case "Text" =>
        sc.textFile(filename)

      case "Sequence" =>
        sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)

      case _ => throw new UnsupportedOperationException(s"Unknown input format: $input_format")
    }
  }

  def save(filename: String, data: RDD[_], prefix: String) = {
    val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
    val output_format_codec =
      loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))

    output_format match {
      case "Text" =>
        if (output_format_codec.isEmpty) data.saveAsTextFile(filename)
        else data.saveAsTextFile(filename, output_format_codec.get)

      case "Sequence" =>
        val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
        if (output_format_codec.isEmpty) {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
        } else {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename,
            output_format_codec.get)
        }

      case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format")
    }
  }

  def save(filename: String, data: RDD[_]): Unit = save(filename, data, "sparkbench.outputformat")

  private def loadClassByName[T](name: Option[String]) = {
    if (!name.isEmpty) Some(Class.forName(name.get)
      .newInstance.asInstanceOf[T].getClass) else None
  }

  private def callMethod[T, R](obj: T, method_name: String) =
    obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
}

object IOCommon {
  private val sparkbench_conf: HashMap[String, String] =
    getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

  def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
    val result = new HashMap[String, String]
    filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
      val file = new File(filename)
      require(file.exists, s"Properties file $file does not exist")
      require(file.isFile, s"Properties file $file is not a normal file")

      val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
      try {
        val properties = new Properties()
        properties.load(inReader)
        result ++= properties.stringPropertyNames()
          .map(k => (k, properties(k).trim)).toMap
      } catch {
        case e: IOException =>
          val message = s"Failed when loading Sparkbench properties file $file"
          throw new SparkException(message, e)
      } finally {
        inReader.close()
      }
    }
    result.filter { case (key, value) => value.toLowerCase != "none" }
  }

  def getProperty(key: String): Option[String] = sparkbench_conf.get(key)

  def dumpProperties(): Unit = sparkbench_conf
    .foreach { case (key, value) => println(s"$key\t\t$value") }
}
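A hedged usage sketch of IOCommon (not part of HiBench itself): it assumes the SPARKBENCH_PROPERTIES_FILES environment variable points at a properties file defining the keys shown in the comment, since the companion object loads that file at class initialization. The paths, property values, and object name are illustrative only.

// Assumed properties file contents (illustrative):
//   sparkbench.inputformat=Text
//   sparkbench.outputformat=Sequence
//   sparkbench.outputformat.codec=org.apache.hadoop.io.compress.SnappyCodec

import org.apache.spark.{SparkConf, SparkContext}
import com.intel.hibench.sparkbench.common.IOCommon

object IOCommonDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("iocommon-demo"))
    val io = new IOCommon(sc)
    // load() picks the input format from sparkbench.inputformat unless force_format is given.
    val lines = io.load[String]("hdfs:///tmp/input", force_format = Some("Text"))
    // save() writes in the configured output format, compressed with the configured codec.
    io.save("hdfs:///tmp/output", lines.map(_.toUpperCase))
    sc.stop()
  }
}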