org.apache.hadoop.io.compress.CompressionCodec Scala Examples

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec. Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
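Before the project examples, here is a minimal orientation sketch of how a CompressionCodec is usually instantiated by class name and driven through its stream API. It is not taken from any of the projects below; the object name, the GzipCodec choice, and the payload are illustrative assumptions.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.util.ReflectionUtils

object CompressionCodecSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Instantiate a codec by class name, the same way Hadoop does internally.
    val codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec")
    val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]

    // Compress a small payload through the codec's stream API ...
    val raw = "hello codec".getBytes("UTF-8")
    val compressed = new ByteArrayOutputStream()
    val cos = codec.createOutputStream(compressed)
    cos.write(raw)
    cos.close()

    // ... and decompress it again.
    val cis = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray))
    val restored = new ByteArrayOutputStream()
    IOUtils.copyBytes(cis, restored, 4096)
    println(new String(restored.toByteArray, "UTF-8"))  // prints "hello codec"
  }
}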
Example 1
Source File: SequenceFileRDDFunctions.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.{classTag, ClassTag}

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging


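  // NOTE: this excerpt omits the enclosing SequenceFileRDDFunctions class declaration,
  // which supplies `self`, `keyWritableClass`, and `valueWritableClass`.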
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` to be the same as keyWritableClass
    // and valueWritableClass at compile time. To implement that, we would need to add type
    // parameters to SequenceFileRDDFunctions. However, SequenceFileRDDFunctions is a public
    // class, so that would be a breaking change.
    val convertKey = self.keyClass != keyWritableClass
    val convertValue = self.valueClass != valueWritableClass

    logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," +
      valueWritableClass.getSimpleName + ")" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    }
  }
} 
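In practice the method above is reached through Spark's implicit conversion on pair RDDs, so callers simply invoke saveAsSequenceFile on an RDD of pairs. A hedged usage sketch, treating the application name, output path, and the Gzip codec as illustrative choices:

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("seqfile-sketch").master("local[*]").getOrCreate()
val sc = spark.sparkContext

// Keys and values are converted to Writables by Spark before writing.
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
pairs.saveAsSequenceFile("/tmp/seq-gzip", Some(classOf[GzipCodec]))
spark.stop()

Passing None (the default) writes an uncompressed SequenceFile.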
Example 2
Source File: CodecFactory.scala    From OAP    with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec}
import org.apache.hadoop.util.ReflectionUtils
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}
import org.apache.parquet.hadoop.metadata.CompressionCodecName

// This is a simple version of Parquet's CodecFactory.
// TODO: [linhong] Need to change this into Scala code style
private[oap] class CodecFactory(conf: Configuration) {

  private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor]
  private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor]
  private val codecByName = new mutable.HashMap[String, CompressionCodec]

  private def getCodec(codecString: String): Option[CompressionCodec] = {
    codecByName.get(codecString) match {
      case Some(codec) => Some(codec)
      case None =>
        val codecName = CompressionCodecName.valueOf(codecString)
        val codecClass = codecName.getHadoopCompressionCodecClass
        if (codecClass == null) {
          None
        } else {
          val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]
          codecByName.put(codecString, codec)
          Some(codec)
        }
    }
  }

  def getCompressor(codec: ParquetCodec): BytesCompressor = {
    compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name)))
  }

  def getDecompressor(codec: ParquetCodec): BytesDecompressor = {
    decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name)))
  }

  def release(): Unit = {
    compressors.values.foreach(_.release())
    compressors.clear()
    decompressors.values.foreach(_.release())
    decompressors.clear()
  }
}

private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val compressedOutBuffer = new ByteArrayOutputStream()
  private lazy val compressor = compressionCodec match {
    case Some(codec) => CodecPool.getCompressor(codec)
    case None => null
  }

  def compress(bytes: Array[Byte]): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        compressedOutBuffer.reset()
        // null compressor for non-native gzip
        if (compressor != null) {
          compressor.reset()
        }
        val cos = codec.createOutputStream(compressedOutBuffer, compressor)
        cos.write(bytes)
        cos.finish()
        cos.close()
        compressedOutBuffer.toByteArray
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnCompressor(compressor)
}

private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val decompressor = compressionCodec match {
    case Some(codec) => CodecPool.getDecompressor(codec)
    case None => null
  }

  def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        decompressor.reset()
        val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor)
        val decompressed = new Array[Byte](uncompressedSize)
        new DataInputStream(cis).readFully(decompressed)
        decompressed
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnDecompressor(decompressor)
} 
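A hedged round-trip sketch for the factory above, assuming the code runs inside the same org.apache.spark.sql.execution.datasources.oap.io package (the classes are private[oap]), a default Hadoop Configuration, and the GZIP entry of Parquet's codec enum (chosen because it needs no native libraries); the payload is an illustrative assumption:

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}

val factory = new CodecFactory(new Configuration())
val payload = "hello oap".getBytes("UTF-8")

// Compress and decompress through the cached compressor/decompressor pair.
val packed = factory.getCompressor(ParquetCodec.GZIP).compress(payload)
val unpacked = factory.getDecompressor(ParquetCodec.GZIP).decompress(packed, payload.length)
assert(new String(unpacked, "UTF-8") == "hello oap")

// Return the pooled (de)compressors when finished.
factory.release()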
Example 3
Source File: SequenceFileRDDFunctions.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging


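  // NOTE: as in Example 1, the enclosing class declaration is omitted; `_keyWritableClass`
  // and `_valueWritableClass` are constructor parameters of SequenceFileRDDFunctions.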
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` to be the same as keyWritableClass
    // and valueWritableClass at compile time. To implement that, we would need to add type
    // parameters to SequenceFileRDDFunctions. However, SequenceFileRDDFunctions is a public
    // class, so that would be a breaking change.
    val convertKey = self.keyClass != _keyWritableClass
    val convertValue = self.valueClass != _valueWritableClass

    logInfo("Saving as sequence file of type " +
      s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    }
  }
} 
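This variant differs from Example 1 mainly in the underscore-prefixed constructor field names used by Spark 2.3.1. For completeness, a hedged sketch of reading such a file back, assuming a live SparkContext named sc and the output path used in the sketch after Example 1:

// Spark resolves the Writable types through its implicit WritableConverters.
val restored = sc.sequenceFile[String, Int]("/tmp/seq-gzip")
restored.collect().foreach { case (k, v) => println(s"$k -> $v") }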
Example 4
Source File: IOCommon.scala    From Swallow    with Apache License 2.0
package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc:SparkContext) {
   def load[T:ClassTag:TypeTag](filename:String, force_format:Option[String]=None) = {
     val input_format = force_format.getOrElse(
       IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))

     input_format match {
       case "Text" =>
         sc.textFile(filename)

       case "Sequence" =>
         sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)

       case _ => throw new UnsupportedOperationException(s"Unknown input format: $input_format")
     }
   }

   def save(filename:String, data:RDD[_], prefix:String) = {
     val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
     val output_format_codec =
       loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))

     output_format match {
       case "Text" =>
         if (output_format_codec.isEmpty)  data.saveAsTextFile(filename)
         else data.saveAsTextFile(filename, output_format_codec.get)

       case "Sequence" =>
         val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
         if (output_format_codec.isEmpty) {
           sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
         } else {
           sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename,
             output_format_codec.get)
         }

       case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format")
     }
   }

   def save(filename:String, data:RDD[_]):Unit = save(filename, data, "sparkbench.outputformat")

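   // Instantiates the named class once and returns its runtime Class object,
   // typed so it can be passed as the codec class to saveAsTextFile/saveAsHadoopFile.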
   private def loadClassByName[T](name:Option[String]) = {
     if (!name.isEmpty) Some(Class.forName(name.get)
       .newInstance.asInstanceOf[T].getClass) else None
   }

   private def callMethod[T, R](obj:T, method_name:String) =
     obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
 }

object IOCommon {
   private val sparkbench_conf: HashMap[String, String] =
     getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

   def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
     val result = new HashMap[String, String]
     filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
       val file = new File(filename)
       require(file.exists, s"Properties file $file does not exist")
       require(file.isFile, s"Properties file $file is not a normal file")

       val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
       try {
         val properties = new Properties()
         properties.load(inReader)
         result ++= properties.stringPropertyNames()
           .map(k => (k, properties(k).trim)).toMap
       } catch {
         case e: IOException =>
           val message = s"Failed when loading Sparkbench properties file $file"
           throw new SparkException(message, e)
       } finally {
         inReader.close()
       }
     }
     result.filter{case (key, value) => value.toLowerCase != "none"}
   }

   def getProperty(key:String):Option[String] = sparkbench_conf.get(key)

   def dumpProperties(): Unit = sparkbench_conf
       .foreach{case (key, value)=> println(s"$key\t\t$value")}
 }
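A hedged usage sketch for IOCommon, assuming the SPARKBENCH_PROPERTIES_FILES environment variable points at a properties file with entries along the lines of the comments below; the property values, the paths, and the existing SparkContext sc are illustrative assumptions:

// Assumed properties file contents (illustrative):
//   sparkbench.inputformat         Text
//   sparkbench.outputformat        Sequence
//   sparkbench.outputformat.codec  org.apache.hadoop.io.compress.GzipCodec

val io = new IOCommon(sc)                        // sc: an existing SparkContext
val lines = io.load[String]("/tmp/words.txt")    // reads the input as a text RDD
io.save("/tmp/words-seq", lines)                 // writes a Gzip-compressed SequenceFile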