org.apache.hadoop.io.compress.CompressionCodec Scala Examples

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec. Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
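Before the project examples, here is a minimal orientation sketch of how a CompressionCodec is usually instantiated by class name and driven through its stream API. It is not taken from any of the projects below; the object name, the GzipCodec choice, and the payload are illustrative assumptions.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.util.ReflectionUtils

object CompressionCodecSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // Instantiate a codec by class name, the same way Hadoop does internally.
    val codecClass = Class.forName("org.apache.hadoop.io.compress.GzipCodec")
    val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]

    // Compress a small payload through the codec's stream API ...
    val raw = "hello codec".getBytes("UTF-8")
    val compressed = new ByteArrayOutputStream()
    val cos = codec.createOutputStream(compressed)
    cos.write(raw)
    cos.close()

    // ... and decompress it again.
    val cis = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray))
    val restored = new ByteArrayOutputStream()
    IOUtils.copyBytes(cis, restored, 4096)
    println(new String(restored.toByteArray, "UTF-8"))  // prints "hello codec"
  }
}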
Example 1
Source File: SequenceFileRDDFunctions.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.{classTag, ClassTag}

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging


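  // NOTE: this excerpt omits the enclosing SequenceFileRDDFunctions class declaration,
  // which supplies `self`, `keyWritableClass`, and `valueWritableClass`.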
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` to be the same as keyWritableClass
    // and valueWritableClass at compile time. To implement that, we would need to add type
    // parameters to SequenceFileRDDFunctions. However, SequenceFileRDDFunctions is a public
    // class, so that would be a breaking change.
    val convertKey = self.keyClass != keyWritableClass
    val convertValue = self.valueClass != valueWritableClass

    logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," +
      valueWritableClass.getSimpleName + ")" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, keyWritableClass, valueWritableClass, format, jobConf, codec)
    }
  }
} 
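In practice the method above is reached through Spark's implicit conversion on pair RDDs, so callers simply invoke saveAsSequenceFile on an RDD of pairs. A hedged usage sketch, treating the application name, output path, and the Gzip codec as illustrative choices:

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("seqfile-sketch").master("local[*]").getOrCreate()
val sc = spark.sparkContext

// Keys and values are converted to Writables by Spark before writing.
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
pairs.saveAsSequenceFile("/tmp/seq-gzip", Some(classOf[GzipCodec]))
spark.stop()

Passing None (the default) writes an uncompressed SequenceFile.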
Example 2
Source File: CodecFactory.scala    From OAP    with Apache License 2.0
package org.apache.spark.sql.execution.datasources.oap.io

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}

import scala.collection.mutable

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodec}
import org.apache.hadoop.util.ReflectionUtils
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}
import org.apache.parquet.hadoop.metadata.CompressionCodecName

// This is a simple version of Parquet's CodecFactory.
// TODO: [linhong] Need to change this into Scala code style
private[oap] class CodecFactory(conf: Configuration) {

  private val compressors = new mutable.HashMap[ParquetCodec, BytesCompressor]
  private val decompressors = new mutable.HashMap[ParquetCodec, BytesDecompressor]
  private val codecByName = new mutable.HashMap[String, CompressionCodec]

  private def getCodec(codecString: String): Option[CompressionCodec] = {
    codecByName.get(codecString) match {
      case Some(codec) => Some(codec)
      case None =>
        val codecName = CompressionCodecName.valueOf(codecString)
        val codecClass = codecName.getHadoopCompressionCodecClass
        if (codecClass == null) {
          None
        } else {
          val codec = ReflectionUtils.newInstance(codecClass, conf).asInstanceOf[CompressionCodec]
          codecByName.put(codecString, codec)
          Some(codec)
        }
    }
  }

  def getCompressor(codec: ParquetCodec): BytesCompressor = {
    compressors.getOrElseUpdate(codec, new BytesCompressor(getCodec(codec.name)))
  }

  def getDecompressor(codec: ParquetCodec): BytesDecompressor = {
    decompressors.getOrElseUpdate(codec, new BytesDecompressor(getCodec(codec.name)))
  }

  def release(): Unit = {
    compressors.values.foreach(_.release())
    compressors.clear()
    decompressors.values.foreach(_.release())
    decompressors.clear()
  }
}

private[oap] class BytesCompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val compressedOutBuffer = new ByteArrayOutputStream()
  private lazy val compressor = compressionCodec match {
    case Some(codec) => CodecPool.getCompressor(codec)
    case None => null
  }

  def compress(bytes: Array[Byte]): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        compressedOutBuffer.reset()
        // null compressor for non-native gzip
        if (compressor != null) {
          compressor.reset()
        }
        val cos = codec.createOutputStream(compressedOutBuffer, compressor)
        cos.write(bytes)
        cos.finish()
        cos.close()
        compressedOutBuffer.toByteArray
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnCompressor(compressor)
}

private[oap] class BytesDecompressor(compressionCodec: Option[CompressionCodec]) {

  private lazy val decompressor = compressionCodec match {
    case Some(codec) => CodecPool.getDecompressor(codec)
    case None => null
  }

  def decompress(bytes: Array[Byte], uncompressedSize: Int): Array[Byte] = {
    compressionCodec match {
      case Some(codec) =>
        decompressor.reset()
        val cis = codec.createInputStream(new ByteArrayInputStream(bytes), decompressor)
        val decompressed = new Array[Byte](uncompressedSize)
        new DataInputStream(cis).readFully(decompressed)
        decompressed
      case None => bytes
    }
  }

  def release(): Unit = CodecPool.returnDecompressor(decompressor)
} 
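A hedged round-trip sketch for the factory above, assuming the code runs inside the same org.apache.spark.sql.execution.datasources.oap.io package (the classes are private[oap]), a default Hadoop Configuration, and the GZIP entry of Parquet's codec enum (chosen because it needs no native libraries); the payload is an illustrative assumption:

import org.apache.hadoop.conf.Configuration
import org.apache.parquet.format.{CompressionCodec => ParquetCodec}

val factory = new CodecFactory(new Configuration())
val payload = "hello oap".getBytes("UTF-8")

// Compress and decompress through the cached compressor/decompressor pair.
val packed = factory.getCompressor(ParquetCodec.GZIP).compress(payload)
val unpacked = factory.getDecompressor(ParquetCodec.GZIP).decompress(packed, payload.length)
assert(new String(unpacked, "UTF-8") == "hello oap")

// Return the pooled (de)compressors when finished.
factory.release()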
Example 3
Source File: SequenceFileRDDFunctions.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.io.Writable
import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapred.SequenceFileOutputFormat

import org.apache.spark.internal.Logging


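  // NOTE: as in Example 1, the enclosing class declaration is omitted; `_keyWritableClass`
  // and `_valueWritableClass` are constructor parameters of SequenceFileRDDFunctions.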
  def saveAsSequenceFile(
      path: String,
      codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope {
    def anyToWritable[U <% Writable](u: U): Writable = u

    // TODO We cannot force the return type of `anyToWritable` to be the same as keyWritableClass
    // and valueWritableClass at compile time. To implement that, we would need to add type
    // parameters to SequenceFileRDDFunctions. However, SequenceFileRDDFunctions is a public
    // class, so that would be a breaking change.
    val convertKey = self.keyClass != _keyWritableClass
    val convertValue = self.valueClass != _valueWritableClass

    logInfo("Saving as sequence file of type " +
      s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})" )
    val format = classOf[SequenceFileOutputFormat[Writable, Writable]]
    val jobConf = new JobConf(self.context.hadoopConfiguration)
    if (!convertKey && !convertValue) {
      self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (!convertKey && convertValue) {
      self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && !convertValue) {
      self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    } else if (convertKey && convertValue) {
      self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile(
        path, _keyWritableClass, _valueWritableClass, format, jobConf, codec)
    }
  }
} 
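This variant differs from Example 1 mainly in the underscore-prefixed constructor field names used by Spark 2.3.1. For completeness, a hedged sketch of reading such a file back, assuming a live SparkContext named sc and the output path used in the sketch after Example 1:

// Spark resolves the Writable types through its implicit WritableConverters.
val restored = sc.sequenceFile[String, Int]("/tmp/seq-gzip")
restored.collect().foreach { case (k, v) => println(s"$k -> $v") }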
Example 4
Source File: IOCommon.scala    From Swallow    with Apache License 2.0
package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc:SparkContext) {
   def load[T:ClassTag:TypeTag](filename:String, force_format:Option[String]=None) = {
     val input_format = force_format.getOrElse(
       IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))

     input_format match {
       case "Text" =>
         sc.textFile(filename)

       case "Sequence" =>
         sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)

       case _ => throw new UnsupportedOperationException(s"Unknown input format: $input_format")
     }
   }

   def save(filename:String, data:RDD[_], prefix:String) = {
     val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
     val output_format_codec =
       loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))

     output_format match {
       case "Text" =>
         if (output_format_codec.isEmpty)  data.saveAsTextFile(filename)
         else data.saveAsTextFile(filename, output_format_codec.get)

       case "Sequence" =>
         val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
         if (output_format_codec.isEmpty) {
           sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
         } else {
           sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename,
             output_format_codec.get)
         }

       case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format")
     }
   }

   def save(filename:String, data:RDD[_]):Unit = save(filename, data, "sparkbench.outputformat")

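   // Instantiates the named class once and returns its runtime Class object,
   // typed so it can be passed as the codec class to saveAsTextFile/saveAsHadoopFile.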
   private def loadClassByName[T](name:Option[String]) = {
     if (!name.isEmpty) Some(Class.forName(name.get)
       .newInstance.asInstanceOf[T].getClass) else None
   }

   private def callMethod[T, R](obj:T, method_name:String) =
     obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
 }

object IOCommon {
   private val sparkbench_conf: HashMap[String, String] =
     getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

   def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
     val result = new HashMap[String, String]
     filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
       val file = new File(filename)
       require(file.exists, s"Properties file $file does not exist")
       require(file.isFile, s"Properties file $file is not a normal file")

       val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
       try {
         val properties = new Properties()
         properties.load(inReader)
         result ++= properties.stringPropertyNames()
           .map(k => (k, properties(k).trim)).toMap
       } catch {
         case e: IOException =>
           val message = s"Failed when loading Sparkbench properties file $file"
           throw new SparkException(message, e)
       } finally {
         inReader.close()
       }
     }
     result.filter{case (key, value) => value.toLowerCase != "none"}
   }

   def getProperty(key:String):Option[String] = sparkbench_conf.get(key)

   def dumpProperties(): Unit = sparkbench_conf
       .foreach{case (key, value)=> println(s"$key\t\t$value")}
 }
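A hedged usage sketch for IOCommon, assuming the SPARKBENCH_PROPERTIES_FILES environment variable points at a properties file with entries along the lines of the comments below; the property values, the paths, and the existing SparkContext sc are illustrative assumptions:

// Assumed properties file contents (illustrative):
//   sparkbench.inputformat         Text
//   sparkbench.outputformat        Sequence
//   sparkbench.outputformat.codec  org.apache.hadoop.io.compress.GzipCodec

val io = new IOCommon(sc)                        // sc: an existing SparkContext
val lines = io.load[String]("/tmp/words.txt")    // reads the input as a text RDD
io.save("/tmp/words-seq", lines)                 // writes a Gzip-compressed SequenceFile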