org.apache.hadoop.io.compress.CompressionCodecFactory Scala Examples
The following examples show how to use org.apache.hadoop.io.compress.CompressionCodecFactory.
Follow the links above each example to go to the original project or source file.
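Every example below follows the same basic pattern: build a CompressionCodecFactory from the job's Hadoop Configuration, ask it for the codec matching a file path (codecs are resolved by file-name extension), and wrap the raw stream only when a codec was found (getCodec returns null otherwise). Here is a minimal, self-contained sketch of that pattern; the object name and the placeholder path are illustrative only and do not come from any of the projects listed below.

import java.io.InputStream

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.compress.CompressionCodecFactory

object CodecFactoryUsage {

  // Open a file, transparently decompressing it when its extension matches a registered codec.
  def openPossiblyCompressed(path: Path, conf: Configuration): InputStream = {
    val fs = path.getFileSystem(conf)
    val factory = new CompressionCodecFactory(conf)
    val codec = factory.getCodec(path) // null when no codec matches the file extension
    val raw = fs.open(path)
    if (codec == null) raw else codec.createInputStream(raw)
  }

  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    // "data/input.gz" is a placeholder path used for illustration only.
    val in = openPossiblyCompressed(new Path("data/input.gz"), conf)
    try {
      // consume `in` here
    } finally {
      in.close()
    }
  }
}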
Example 1
Source File: WholeTextFileRecordReader.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 2
Source File: FileStreamRecordReader.scala From geotrellis-pointcloud with Apache License 2.0
package geotrellis.pointcloud.spark.store.hadoop.formats

import org.apache.hadoop.fs._
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input._

import java.io.InputStream

class FileStreamRecordReader[K, V](read: InputStream => (K, V)) extends RecordReader[K, V] {
  private var tup: (K, V) = null
  private var hasNext: Boolean = true

  def initialize(split: InputSplit, context: TaskAttemptContext) = {
    val path = split.asInstanceOf[FileSplit].getPath()
    val conf = context.getConfiguration()
    val fs = path.getFileSystem(conf)

    val is: InputStream = {
      val factory = new CompressionCodecFactory(conf)
      val codec = factory.getCodec(path)

      if (codec == null) fs.open(path)
      else codec.createInputStream(fs.open(path))
    }

    tup = read(is)
  }

  def close = {}
  def getCurrentKey = tup._1
  def getCurrentValue = { hasNext = false ; tup._2 }
  def getProgress = 1
  def nextKeyValue = hasNext
}

trait FileStreamInputFormat[K, V] extends FileInputFormat[K, V] {
  def read(is: InputStream, context: TaskAttemptContext): (K, V)

  override def isSplitable(context: JobContext, fileName: Path) = false

  override def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[K, V] =
    new FileStreamRecordReader({ is => read(is, context) })
}
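FileStreamInputFormat leaves only read for implementors to fill in. A hypothetical subclass might look like the sketch below; the class name, the Text key/value choice, and the idea of loading the whole file as one UTF-8 string are assumptions made for illustration and are not part of the geotrellis-pointcloud source.

import java.io.{ByteArrayOutputStream, InputStream}
import java.nio.charset.StandardCharsets

import geotrellis.pointcloud.spark.store.hadoop.formats.FileStreamInputFormat
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.TaskAttemptContext

// Hypothetical input format: loads the whole (possibly compressed) file as one UTF-8 string.
class WholeTextStreamInputFormat extends FileStreamInputFormat[Text, Text] {
  override def read(is: InputStream, context: TaskAttemptContext): (Text, Text) = {
    val buffer = new ByteArrayOutputStream()
    val chunk = new Array[Byte](8192)
    var n = is.read(chunk)
    while (n != -1) {
      buffer.write(chunk, 0, n)
      n = is.read(chunk)
    }
    is.close()
    (new Text("file"), new Text(new String(buffer.toByteArray, StandardCharsets.UTF_8)))
  }
}

An input format defined this way can then be passed to SparkContext.newAPIHadoopFile to obtain an RDD of (Text, Text) pairs.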
Example 3
Source File: WholeFileReader.scala From magellan with Apache License 2.0
package magellan.mapreduce

import java.io.InputStream

import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.hadoop.io.compress.{CodecPool, CompressionCodecFactory, Decompressor}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import org.apache.hadoop.mapreduce.{InputSplit, RecordReader, TaskAttemptContext}

class WholeFileReader extends RecordReader[NullWritable, Text] {

  private val key = NullWritable.get()
  private val value = new Text()
  private var split: FileSplit = _
  private var conf: Configuration = _
  private var path: Path = _
  private var done: Boolean = false

  override def getProgress: Float = ???

  override def nextKeyValue(): Boolean = {
    if (done) {
      false
    } else {
      val fs = path.getFileSystem(conf)
      var is: FSDataInputStream = null
      var in: InputStream = null
      var decompressor: Decompressor = null
      try {
        is = fs.open(split.getPath)
        val codec = new CompressionCodecFactory(conf).getCodec(path)
        if (codec != null) {
          decompressor = CodecPool.getDecompressor(codec)
          in = codec.createInputStream(is, decompressor)
        } else {
          in = is
        }
        val result = IOUtils.toByteArray(in)
        value.clear()
        value.set(result)
        done = true
        true
      } finally {
        if (in != null) {
          IOUtils.closeQuietly(in)
        }
        if (decompressor != null) {
          CodecPool.returnDecompressor(decompressor)
        }
      }
    }
  }

  override def getCurrentValue: Text = value

  override def initialize(inputSplit: InputSplit,
      taskAttemptContext: TaskAttemptContext): Unit = {
    this.split = inputSplit.asInstanceOf[FileSplit]
    this.conf = MapReduceUtils.getConfigurationFromContext(taskAttemptContext)
    this.path = this.split.getPath
  }

  override def getCurrentKey: NullWritable = key

  override def close() {}
}
Example 4
Source File: WholeTextFileRecordReader.scala From sparkoscope with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 5
Source File: WholeTextFileRecordReader.scala From SparkCore with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 6
Source File: GzipDecompressor.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.algo

import java.util.concurrent.{Executors, TimeUnit}

import com.adidas.analytics.algo.GzipDecompressor.{changeFileExtension, compressedExtension, _}
import com.adidas.analytics.algo.core.JobRunner
import com.adidas.analytics.config.GzipDecompressorConfiguration
import com.adidas.analytics.util.DFSWrapper
import com.adidas.analytics.util.DFSWrapper._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.IOUtils
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.spark.sql.SparkSession
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent._
import scala.concurrent.duration._

final class GzipDecompressor protected(val spark: SparkSession, val dfs: DFSWrapper, val configLocation: String)
  extends JobRunner with GzipDecompressorConfiguration {

  private val hadoopConfiguration: Configuration = spark.sparkContext.hadoopConfiguration
  private val fileSystem: FileSystem = dfs.getFileSystem(inputDirectoryPath)

  override def run(): Unit = {
    // check if directory exists
    if (!fileSystem.exists(inputDirectoryPath)) {
      logger.error(s"Input directory: $inputDirectoryPath does not exist.")
      throw new RuntimeException(s"Directory $inputDirectoryPath does not exist.")
    }

    val compressedFilePaths = fileSystem.ls(inputDirectoryPath, recursive)
      .filterNot(path => fileSystem.isDirectory(path))
      .filter(_.getName.toLowerCase.endsWith(compressedExtension))

    if (compressedFilePaths.isEmpty) {
      logger.warn(s"Input directory $inputDirectoryPath does not contain compressed files. Skipping...")
    } else {
      implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(Executors.newFixedThreadPool(threadPoolSize))
      Await.result(Future.sequence(
        compressedFilePaths.map { compressedFilePath =>
          Future {
            logger.info(s"Decompressing file: $compressedFilePath")

            val decompressedFileName = changeFileExtension(compressedFilePath.getName, compressedExtension, outputExtension)
            val decompressedFilePath = new Path(compressedFilePath.getParent, decompressedFileName)

            val compressionCodecFactory = new CompressionCodecFactory(hadoopConfiguration)
            val inputCodec = compressionCodecFactory.getCodec(compressedFilePath)

            val inputStream = inputCodec.createInputStream(fileSystem.open(compressedFilePath))
            val output = fileSystem.create(decompressedFilePath)

            IOUtils.copyBytes(inputStream, output, hadoopConfiguration)
            logger.info(s"Finished decompressing file: $compressedFilePath")

            // Delete the compressed file
            fileSystem.delete(compressedFilePath, false)
            logger.info(s"Removed file: $compressedFilePath")
          }
        }
      ), Duration(4, TimeUnit.HOURS))
    }
  }
}

object GzipDecompressor {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  private val compressedExtension: String = ".gz"

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): GzipDecompressor = {
    new GzipDecompressor(spark, dfs, configLocation)
  }

  private def changeFileExtension(fileName: String, currentExt: String, newExt: String): String = {
    val newFileName = fileName.substring(0, fileName.lastIndexOf(currentExt))
    if (newFileName.endsWith(newExt)) newFileName else newFileName + newExt
  }
}
Example 7
Source File: WholeTextFileRecordReader.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 8
Source File: WholeTextFileRecordReader.scala From iolap with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 9
Source File: WholeTextFileRecordReader.scala From spark1.52 with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 10
Source File: WholeTextFileRecordReader.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.input

import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.conf.{Configurable => HConfigurable, Configuration}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.{CombineFileRecordReader, CombineFileSplit}

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 11
Source File: WholeTextFileRecordReader.scala From BigDatalog with Apache License 2.0
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}