Java Code Examples for org.apache.hadoop.io.compress.CompressionCodec#createInputStream()
The following examples show how to use
org.apache.hadoop.io.compress.CompressionCodec#createInputStream().
Each example is drawn from an open-source project; the source file and originating project are noted above each example.
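Before the project-specific examples, here is a minimal, self-contained sketch of the common pattern: resolve a codec from the file name with CompressionCodecFactory, open the file, and wrap the raw stream with createInputStream() when a codec is found. The class name CodecReadSketch and the helper openPossiblyCompressed are illustrative only and do not come from any of the projects listed below; this is a sketch of the API usage, not production code.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecReadSketch {

  // Opens `path`, wrapping the raw stream with the codec inferred from the
  // file extension (e.g. .gz, .bz2, .deflate). Falls back to the raw stream
  // when no codec matches the name.
  static InputStream openPossiblyCompressed(FileSystem fs, Path path, Configuration conf)
      throws IOException {
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    InputStream raw = fs.open(path);
    if (codec == null) {
      return raw; // plain, uncompressed file
    }
    // Without an explicit Decompressor argument the codec allocates one itself;
    // several examples below show the CodecPool variant instead.
    return codec.createInputStream(raw);
  }

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    try (InputStream in = openPossiblyCompressed(fs, new Path(args[0]), conf)) {
      // ... read decompressed bytes from `in` ...
      System.out.println("first byte: " + in.read());
    }
  }
}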
Example 1
Source File: HadoopFileReader.java From hadoopoffice with Apache License 2.0
public InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = compressionCodecs.getCodec(path);
  FSDataInputStream fileIn = fs.open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    LOG.debug("Reading from an uncompressed file \"" + path + "\"");
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      LOG.debug("Reading from a compressed file \"" + path + "\" with splittable compression codec");
      long end = fs.getFileStatus(path).getLen();
      return ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, 0, end,
          SplittableCompressionCodec.READ_MODE.CONTINUOUS);
    } else {
      LOG.debug("Reading from a compressed file \"" + path + "\" with non-splittable compression codec");
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
Example 2
Source File: IFile.java From tez with Apache License 2.0
private static InputStream getDecompressedInputStreamWithBufferSize(CompressionCodec codec,
    IFileInputStream checksumIn, Decompressor decompressor, int compressedLength)
    throws IOException {
  String bufferSizeProp = TezRuntimeUtils.getBufferSizeProperty(codec);

  if (bufferSizeProp != null) {
    Configurable configurableCodec = (Configurable) codec;
    Configuration conf = configurableCodec.getConf();

    int bufSize = Math.min(compressedLength, DEFAULT_BUFFER_SIZE);
    LOG.trace("buffer size was set according to min(compressedLength, {}): {}={}",
        DEFAULT_BUFFER_SIZE, bufferSizeProp, bufSize);
    conf.setInt(bufferSizeProp, bufSize);
  }

  return codec.createInputStream(checksumIn, decompressor);
}
Example 3
Source File: MapReduceBitcoinBlockIntegrationTest.java From hadoopcryptoledger with Apache License 2.0
private InputStream openFile(Path path) throws IOException {
  CompressionCodec codec = new CompressionCodecFactory(miniCluster.getConfig()).getCodec(path);
  FSDataInputStream fileIn = dfsCluster.getFileSystem().open(path);
  // check if compressed
  if (codec == null) { // uncompressed
    return fileIn;
  } else { // compressed
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    this.openDecompressors.add(decompressor); // to be returned later using close
    if (codec instanceof SplittableCompressionCodec) {
      long end = dfsCluster.getFileSystem().getFileStatus(path).getLen();
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec)
          .createInputStream(fileIn, decompressor, 0, end,
              SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      return cIn;
    } else {
      return codec.createInputStream(fileIn, decompressor);
    }
  }
}
Example 4
Source File: CellBlockBuilder.java From hbase with Apache License 2.0
private ByteBuffer decompress(CompressionCodec compressor, InputStream cellBlockStream,
    int osInitialSize) throws IOException {
  // GZIPCodec fails w/ NPE if no configuration.
  if (compressor instanceof Configurable) {
    ((Configurable) compressor).setConf(this.conf);
  }
  Decompressor poolDecompressor = CodecPool.getDecompressor(compressor);
  CompressionInputStream cis = compressor.createInputStream(cellBlockStream, poolDecompressor);
  ByteBufferOutputStream bbos;
  try {
    // TODO: This is ugly. The buffer will be resized on us if we guess wrong.
    // TODO: Reuse buffers.
    bbos = new ByteBufferOutputStream(osInitialSize);
    IOUtils.copy(cis, bbos);
    bbos.close();
    return bbos.getByteBuffer();
  } finally {
    CodecPool.returnDecompressor(poolDecompressor);
  }
}
Example 5
Source File: IFile.java From hadoop-gpu with Apache License 2.0
/**
 * Construct an IFile Reader.
 *
 * @param conf Configuration File
 * @param in The input stream
 * @param length Length of the data in the stream, including the checksum bytes.
 * @param codec codec
 * @param readsCounter Counter for records read from disk
 * @throws IOException
 */
public Reader(Configuration conf, FSDataInputStream in, long length,
              CompressionCodec codec, Counters.Counter readsCounter) throws IOException {
  readRecordsCounter = readsCounter;
  checksumIn = new IFileInputStream(in, length);
  if (codec != null) {
    decompressor = CodecPool.getDecompressor(codec);
    this.in = codec.createInputStream(checksumIn, decompressor);
  } else {
    this.in = checksumIn;
  }
  this.fileLength = length;

  if (conf != null) {
    bufferSize = conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE);
  }
}
Example 6
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
  setConf(conf);
  file = split.getPath();
  start = split.getStart();
  end = start + split.getLength();

  FileSystem fs = file.getFileSystem(conf);
  FSDataInputStream fileIn = fs.open(file);

  CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
  CompressionCodec codec = codecFactory.getCodec(file);

  if (codec == null) { // no codec. Uncompressed file.
    positionAtFirstRecord(fileIn);
    inputStream = fileIn;
  } else { // compressed file
    if (start != 0)
      throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

    inputStream = codec.createInputStream(fileIn);
    end = Long.MAX_VALUE; // read until the end of the file
  }

  lineReader = new LineReader(inputStream);
}
Example 7
Source File: WARCFileReader.java From warc-hadoop with MIT License
/**
 * Opens a file for reading. If the filename ends in `.gz`, it is automatically decompressed
 * on the fly.
 * @param conf The Hadoop configuration.
 * @param filePath The Hadoop path to the file that should be read.
 * @throws IOException
 */
public WARCFileReader(Configuration conf, Path filePath) throws IOException {
  FileSystem fs = filePath.getFileSystem(conf);
  this.fileSize = fs.getFileStatus(filePath).getLen();
  logger.info("Reading from " + filePath);

  CompressionCodec codec = filePath.getName().endsWith(".gz")
      ? WARCFileWriter.getGzipCodec(conf) : null;
  byteStream = new CountingInputStream(new BufferedInputStream(fs.open(filePath)));
  dataStream = new DataInputStream(codec == null ? byteStream : codec.createInputStream(byteStream));
}
Example 8
Source File: HiveColumnCardinalityUpdateJob.java From Kylin with Apache License 2.0
private static List<String> readLines(Path location, Configuration conf) throws Exception {
  FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  FileStatus[] items = fileSystem.listStatus(location);
  if (items == null)
    return new ArrayList<String>();
  List<String> results = new ArrayList<String>();
  for (FileStatus item : items) {
    // ignoring files like _SUCCESS
    if (item.getPath().getName().startsWith("_")) {
      continue;
    }

    CompressionCodec codec = factory.getCodec(item.getPath());
    InputStream stream = null;

    // check if we have a compression codec we need to use
    if (codec != null) {
      stream = codec.createInputStream(fileSystem.open(item.getPath()));
    } else {
      stream = fileSystem.open(item.getPath());
    }

    StringWriter writer = new StringWriter();
    IOUtils.copy(stream, writer, "UTF-8");
    String raw = writer.toString();
    for (String str : raw.split("\n")) {
      results.add(str);
    }
  }
  return results;
}
Example 9
Source File: DelimitedTextFileReaderWriterFactory.java From secor with Apache License 2.0
public DelimitedTextFileReader(LogFilePath path, CompressionCodec codec) throws IOException {
  Path fsPath = new Path(path.getLogFilePath());
  FileSystem fs = FileUtil.getFileSystem(path.getLogFilePath());
  InputStream inputStream = fs.open(fsPath);
  this.mReader = (codec == null) ? new BufferedInputStream(inputStream)
      : new BufferedInputStream(
          codec.createInputStream(inputStream, mDecompressor = CodecPool.getDecompressor(codec)));
  this.mOffset = path.getOffset();
}
Example 10
Source File: HadoopFsHelper.java From incubator-gobblin with Apache License 2.0
/**
 * Returns an {@link InputStream} to the specified file.
 * <p>
 * Note: It is the caller's responsibility to close the returned {@link InputStream}.
 * </p>
 *
 * @param path The path to the file to open.
 * @return An {@link InputStream} for the specified file.
 * @throws FileBasedHelperException if there is a problem opening the {@link InputStream} for the specified file.
 */
@Override
public InputStream getFileStream(String path) throws FileBasedHelperException {
  try {
    Path p = new Path(path);
    InputStream in = this.getFileSystem().open(p);
    // Account for compressed files (e.g. gzip).
    // https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/input/WholeTextFileRecordReader.scala
    CompressionCodecFactory factory = new CompressionCodecFactory(this.getFileSystem().getConf());
    CompressionCodec codec = factory.getCodec(p);
    return (codec == null) ? in : codec.createInputStream(in);
  } catch (IOException e) {
    throw new FileBasedHelperException("Cannot open file " + path + " due to " + e.getMessage(), e);
  }
}
Example 11
Source File: LineRecordReader.java From RDFS with Apache License 2.0
public LineRecordReader(Configuration job, FileSplit split) throws IOException {
  this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  boolean skipFirstLine = false;
  if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
  } else {
    if (start != 0) {
      skipFirstLine = true;
      --start;
      fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
  }
  if (skipFirstLine) { // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
  }
  this.pos = start;
}
Example 12
Source File: FSImageUtil.java From big-c with Apache License 2.0
public static InputStream wrapInputStreamForCompression(
    Configuration conf, String codec, InputStream in) throws IOException {
  if (codec.isEmpty())
    return in;

  FSImageCompression compression = FSImageCompression.createCompression(conf, codec);
  CompressionCodec imageCodec = compression.getImageCodec();
  return imageCodec.createInputStream(in);
}
Example 13
Source File: CompressionFactoryITCase.java From flink with Apache License 2.0
private List<String> readFile(File file, CompressionCodec codec) throws Exception {
  try (
      FileInputStream inputStream = new FileInputStream(file);
      InputStreamReader readerStream = new InputStreamReader(codec.createInputStream(inputStream));
      BufferedReader reader = new BufferedReader(readerStream)
  ) {
    return reader.lines().collect(Collectors.toList());
  }
}
Example 14
Source File: FreightStreamer.java From RDFS with Apache License 2.0
private InputStream decompress(Path p, FileSystem srcFs) throws IOException {
  CompressionCodecFactory factory = new CompressionCodecFactory(getConf());
  CompressionCodec codec = factory.getCodec(p);
  InputStream in = srcFs.open(p);
  if (codec == null) {
    throw new IOException("Cannot find codec for " + p);
  }
  return codec.createInputStream(in);
}
Example 15
Source File: Excel97FileRecordReader.java From components with Apache License 2.0
private InputStream createInputStream(Configuration job, final Path file) throws IOException {
  final FileSystem fs = file.getFileSystem(job);
  InputStream in = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    decompressor = CodecPool.getDecompressor(codec);
    in = codec.createInputStream(in, decompressor);
  }

  return in;
}
Example 16
Source File: InterleaveMulti.java From ViraPipe with MIT License
private static void decompress(FileSystem fs, String in, String outpath) throws IOException {
  Configuration conf = new Configuration();
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);
  CompressionCodec codec = factory.getCodec(new Path(in));
  // Decompressing zip file.
  InputStream is = codec.createInputStream(fs.open(new Path(in)));
  OutputStream out = fs.create(new Path(outpath));
  // Write decompressed out
  IOUtils.copyBytes(is, out, conf);
  is.close();
  out.close();
}
Example 17
Source File: TestInsertQuery.java From tajo with Apache License 2.0
@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
  ResultSet res = executeString("insert overwrite into location "
      + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' "
      + "USING text WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
      + "select 1::INT4, 2.1::FLOAT4, 'test'");
  res.close();

  FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
  Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
  assertTrue(fs.exists(path));
  assertEquals(1, fs.listStatus(path).length);

  CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
  FileStatus file = fs.listStatus(path)[0];
  CompressionCodec codec = factory.getCodec(file.getPath());
  assertTrue(codec instanceof DeflateCodec);

  try (BufferedReader reader = new BufferedReader(
      new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))))) {
    String line = reader.readLine();
    assertNotNull(line);

    String[] tokens = line.split("\\|");
    assertEquals(3, tokens.length);
    assertEquals("1", tokens[0]);
    assertEquals("2.1", tokens[1]);
    assertEquals("test", tokens[2]);
  }
}
Example 18
Source File: LineRecordReader.java From big-c with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec) codec).createInputStream(
              fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
          this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
Example 19
Source File: LineRecordReader.java From hadoop with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec) codec).createInputStream(
              fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job,
          this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
    filePosition = fileIn;
  }
  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
Example 20
Source File: JHLogAnalyzer.java From big-c with Apache License 2.0
/**
 * Collect information about one job.
 *
 * @param fs - file system
 * @param filePath - full path of a history log file
 * @param offset - starting offset in the history log file
 * @throws IOException
 */
public void parseLogFile(FileSystem fs,
                         Path filePath,
                         long offset,
                         OutputCollector<Text, Text> output,
                         Reporter reporter) throws IOException {
  InputStream in = null;
  try {
    // open file & seek
    FSDataInputStream stm = fs.open(filePath);
    stm.seek(offset);
    in = stm;
    LOG.info("Opened " + filePath);
    reporter.setStatus("Opened " + filePath);
    // get a compression filter if specified
    if (compressionClass != null) {
      CompressionCodec codec = (CompressionCodec)
          ReflectionUtils.newInstance(compressionClass, new Configuration());
      in = codec.createInputStream(stm);
      LOG.info("Codec created " + filePath);
      reporter.setStatus("Codec created " + filePath);
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    LOG.info("Reader created " + filePath);
    // skip to the next job log start
    long processed = 0L;
    if (jobDelimiterPattern != null) {
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        if ((stm.getPos() - processed) > 100000) {
          processed = stm.getPos();
          reporter.setStatus("Processing " + filePath + " at " + processed);
        }
        if (isEndOfJobLog(line))
          break;
      }
    }
    // parse lines and update job history
    JobHistoryLog jh = new JobHistoryLog();
    int jobLineCount = 0;
    for (String line = readLine(reader); line != null; line = readLine(reader)) {
      jobLineCount++;
      if ((stm.getPos() - processed) > 20000) {
        processed = stm.getPos();
        long numTasks = (jh.tasks == null ? 0 : jh.tasks.size());
        String txt = "Processing " + filePath + " at " + processed
            + " # tasks = " + numTasks;
        reporter.setStatus(txt);
        LOG.info(txt);
      }
      if (isEndOfJobLog(line)) {
        if (jh.JOBID != null) {
          LOG.info("Finished parsing job: " + jh.JOBID + " line count = " + jobLineCount);
          collectJobStats(jh, output, reporter);
          LOG.info("Collected stats for job: " + jh.JOBID);
        }
        jh = new JobHistoryLog();
        jobLineCount = 0;
      } else
        jh.parseLine(line);
    }
    if (jh.JOBID == null) {
      LOG.error("JOBID = NULL in " + filePath + " at " + processed);
      return;
    }
    collectJobStats(jh, output, reporter);
  } catch (Exception ie) {
    // parsing errors can happen if the file has been truncated
    LOG.error("JHLAMapper.parseLogFile", ie);
    reporter.setStatus("JHLAMapper.parseLogFile failed " + StringUtils.stringifyException(ie));
    throw new IOException("Job failed.", ie);
  } finally {
    if (in != null)
      in.close();
  }
}
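Several of the examples above borrow a Decompressor from CodecPool and defer returning it until a later close(). As a complementary sketch (the class name PooledDecompressorStream is illustrative and does not come from any of the projects above), the following shows one way to tie the borrowed decompressor's lifetime to the stream so that CodecPool.returnDecompressor() is always called:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;

// Pairs a CodecPool-borrowed Decompressor with the decompressing stream it
// backs, so the decompressor is returned to the pool exactly once when the
// stream is closed.
public class PooledDecompressorStream implements AutoCloseable {

  private final InputStream stream;
  private final Decompressor decompressor;

  public PooledDecompressorStream(CompressionCodec codec, InputStream raw) throws IOException {
    this.decompressor = CodecPool.getDecompressor(codec);
    this.stream = codec.createInputStream(raw, decompressor);
  }

  public InputStream stream() {
    return stream;
  }

  @Override
  public void close() throws IOException {
    try {
      stream.close();
    } finally {
      // Always hand the decompressor back, even if close() throws.
      CodecPool.returnDecompressor(decompressor);
    }
  }
}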