org.apache.hadoop.io.compress.GzipCodec Scala Examples

The following examples show how to use org.apache.hadoop.io.compress.GzipCodec in Scala. Each example is excerpted from an open-source project; the source file, project, and license are listed above the code.
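Before the project excerpts, here is a minimal, self-contained sketch of the pattern both examples build on: passing classOf[GzipCodec] to saveAsTextFile so each output part file is written through the Gzip codec. The application name, output path, and sample data are placeholders.

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.sql.SparkSession

object GzipCodecSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("gzip-codec-sketch").master("local[*]").getOrCreate()
    val lines = spark.sparkContext.parallelize(Seq("alpha", "beta", "gamma"))
    // Each part file is gzip-compressed and gets a ".gz" extension.
    lines.saveAsTextFile("/tmp/gzip-codec-sketch", classOf[GzipCodec])
    spark.stop()
  }
}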
Example 1
Source File: TextFileFormat.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


// Excerpt: the rest of TextFileFormat.scala is elided here; the wrapper object
// below is not part of the original file and is added only so the helper
// compiles on its own.
object TextCompressionUtil {

  // Returns the filename extension of the configured output codec (e.g. ".gz"),
  // defaulting to GzipCodec, or an empty string when output compression is off.
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
} 
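A hedged sketch of how such a helper is typically used when naming output files. Nothing below is drizzle-spark code; the wrapper name TextCompressionUtil comes from the excerpt above, and stagingDir and the part-file pattern are illustrative only.

// Hypothetical caller: append the codec extension when building a part-file path.
def workFile(stagingDir: Path, context: TaskAttemptContext): Path = {
  val extension = TextCompressionUtil.getCompressionExtension(context) // e.g. ".gz"
  val taskId = context.getTaskAttemptID.getTaskID.getId
  new Path(stagingDir, f"part-$taskId%05d$extension")
}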
Example 2
Source File: package.scala    From sparkpipe-core    with Apache License 2.0
package software.uncharted.sparkpipe.ops.core.rdd

import org.apache.hadoop.io.compress.{GzipCodec, BZip2Codec}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


// Excerpt: the original file is a package object whose declaration and string
// constants are elided; the wrapper and constant values below are reconstructed
// so the snippet compiles on its own, and may differ from the exact literals in
// sparkpipe-core.
package object io {
  val TEXT_FORMAT: String = "text"
  val CODEC: String = "codec"
  val GZIP_CODEC: String = "gzip"
  val BZIP2_CODEC: String = "bzip2"

  // Write an RDD as text files, optionally compressed, and return the input RDD
  // unchanged so the operation can be chained in a pipeline.
  def write[T] (
    path: String,
    format: String = TEXT_FORMAT,
    options: Map[String, String] = Map[String, String]()
  )(input: RDD[T]): RDD[T] = {
    assert(TEXT_FORMAT == format, "Only text format currently supported")
    options.get(CODEC).map(_.trim.toLowerCase) match {
      case Some(BZIP2_CODEC) =>
        input.saveAsTextFile(path, classOf[BZip2Codec])
      case Some(GZIP_CODEC) =>
        input.saveAsTextFile(path, classOf[GzipCodec])
      case _ =>
        input.saveAsTextFile(path)
    }
    input
  }
}
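A short usage sketch of the write op above, as it might be run from a Spark shell. The output path and sample data are placeholders, and the option key and codec name assume the constants reconstructed in the excerpt.

import software.uncharted.sparkpipe.ops.core.rdd.io._
import org.apache.spark.sql.SparkSession

val sc = SparkSession.builder().master("local[*]").getOrCreate().sparkContext
val lines = sc.parallelize(Seq("alpha", "beta", "gamma"))
// Save as gzip-compressed text; omitting the "codec" option leaves the files uncompressed.
write("/tmp/sparkpipe-gzip-out", options = Map("codec" -> "gzip"))(lines)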