org.apache.hadoop.io.compress.BZip2Codec Java Examples
The following examples show how to use
org.apache.hadoop.io.compress.BZip2Codec.
Each snippet comes from a real open-source project; the source file, project, and license are noted above each example.
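Before the project examples, here is a minimal round-trip sketch of the codec's core streaming API: compressing an in-memory payload and decompressing it again. This snippet is illustrative and not taken from any of the projects below; the class name and payload are assumptions.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;

public class BZip2RoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf); // the codec is Configurable and expects a Configuration

    // Compress an in-memory payload (illustrative data).
    byte[] payload = "hello, bzip2".getBytes("UTF-8");
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    CompressionOutputStream out = codec.createOutputStream(compressed);
    out.write(payload);
    out.finish(); // flush the final bzip2 block
    out.close();

    // Decompress and copy back into a plain stream.
    CompressionInputStream in =
        codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()));
    ByteArrayOutputStream restored = new ByteArrayOutputStream();
    IOUtils.copyBytes(in, restored, conf, true);
    System.out.println(restored.toString("UTF-8")); // prints: hello, bzip2
  }
}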
Example #1
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  int transactCount = 0;
  while (reader.nextKeyValue()) {
    transactCount++;
  }
  assertEquals(936, transactCount, "Compressed block must have exactly 936 transactions");
  reader.close();
}
Example #2
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
Example #3
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinTransactionInputFormatBzip2Compressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinTransactionFileInputFormat format = new BitcoinTransactionFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinTransaction> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BitcoinTransaction transaction = new BitcoinTransaction();
  int transactCount = 0;
  while (reader.next(key, transaction)) {
    transactCount++;
  }
  assertEquals(936, transactCount, "Compressed block must have exactly 936 transactions");
  reader.close();
}
Example #4
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinRawBlockInputFormatBzip2Compressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinRawBlockFileInputFormat format = new BitcoinRawBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BytesWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BytesWritable block = new BytesWritable();
  assertTrue(reader.next(key, block), "Input Split for block version contains at least one block");
  assertEquals(998039, block.getLength(), "Compressed block must have a size of 998,039 bytes");
  BytesWritable emptyKey = new BytesWritable();
  BytesWritable emptyBlock = new BytesWritable();
  assertFalse(reader.next(emptyKey, emptyBlock), "No further blocks in compressed block");
  reader.close();
}
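All four tests above assert that the bzip2 input produces exactly one split. As a side note (Hadoop 2.x behavior), BZip2Codec implements SplittableCompressionCodec, so input formats that opt in, such as TextInputFormat, can split .bz2 files; the Bitcoin formats evidently read each compressed file as a single split. A minimal sketch of the standard splittability check, mirroring what TextInputFormat does:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;

public class SplittabilityCheck {
  // Mirrors the check TextInputFormat performs before splitting a file.
  static boolean isSplittable(Configuration conf, Path path) {
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    // No codec means the file is uncompressed and freely splittable;
    // BZip2Codec passes the instanceof test, GzipCodec does not.
    return codec == null || codec instanceof SplittableCompressionCodec;
  }
}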
Example #5
Source File: S3SelectPushdown.java From presto with Apache License 2.0
public static boolean isCompressionCodecSupported(InputFormat<?, ?> inputFormat, Path path) {
  if (inputFormat instanceof TextInputFormat) {
    return getCompressionCodec((TextInputFormat) inputFormat, path)
        .map(codec -> (codec instanceof GzipCodec) || (codec instanceof BZip2Codec))
        .orElse(false); // TODO (https://github.com/prestosql/presto/issues/2475) fix S3 Select when file not compressed
  }
  return false;
}
Example #6
Source File: MultiStorage.java From spork with Apache License 2.0
@Override
public void setStoreLocation(String location, Job job) throws IOException {
  job.getConfiguration().set(MRConfiguration.TEXTOUTPUTFORMAT_SEPARATOR, "");
  FileOutputFormat.setOutputPath(job, new Path(location));
  if (comp == Compression.bz2 || comp == Compression.bz) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  } else if (comp == Compression.gz) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  }
}
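The FileOutputFormat helpers above are a thin wrapper over job configuration. As a rough equivalent, assuming the Hadoop 2.x property names and a Job object in scope, the same output compression can be requested directly:

// Property-level equivalent of the helper calls above (Hadoop 2.x names).
Configuration conf = job.getConfiguration();
conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
conf.set("mapreduce.output.fileoutputformat.compress.codec",
    "org.apache.hadoop.io.compress.BZip2Codec");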
Example #7
Source File: PigStorage.java From spork with Apache License 2.0
private void setCompression(Path path, Job job) {
  String location = path.getName();
  if (location.endsWith(".bz2") || location.endsWith(".bz")) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
  } else if (location.endsWith(".gz")) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
  } else {
    FileOutputFormat.setCompressOutput(job, false);
  }
}
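PigStorage matches the .bz2/.bz/.gz suffixes by hand. For comparison, a small self-contained sketch using CompressionCodecFactory, which already maintains the registered extension-to-codec mapping (the output path here is an assumption):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecBySuffix {
  public static void main(String[] args) {
    CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    CompressionCodec codec = factory.getCodec(new Path("out/part-0000.bz2")); // illustrative path
    // Prints org.apache.hadoop.io.compress.BZip2Codec for the .bz2 suffix;
    // getCodec returns null for unknown extensions.
    System.out.println(codec == null ? "none" : codec.getClass().getName());
  }
}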
Example #8
Source File: TestExtractor.java From sqoop-on-spark with Apache License 2.0
@DataProvider(name="test-hdfs-extractor") public static Object[][] data() { List<Object[]> parameters = new ArrayList<Object[]>(); for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) { for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) { parameters.add(new Object[]{outputFileType, compressionClass}); } } return parameters.toArray(new Object[0][]); }
Example #9
Source File: TestPartitioner.java From sqoop-on-spark with Apache License 2.0
@DataProvider(name="test-hdfs-partitioner") public static Object[][] data() { List<Object[]> parameters = new ArrayList<Object[]>(); for (Class<?> compressionClass : new Class<?>[]{null, DefaultCodec.class, BZip2Codec.class}) { for (Object outputFileType : new Object[]{TEXT_FILE, SEQUENCE_FILE}) { parameters.add(new Object[]{outputFileType, compressionClass}); } } return parameters.toArray(new Object[0][]); }
Example #10
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  Job job = Job.getInstance(conf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, conf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  assertEquals(1, splits.size(), "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinBlock> reader = format.createRecordReader(splits.get(0), context);
  assertNotNull(reader, "Format returned null RecordReader");
  reader.initialize(splits.get(0), context);
  BytesWritable key = new BytesWritable();
  BitcoinBlock block = new BitcoinBlock();
  assertTrue(reader.nextKeyValue(), "Input Split for block version contains at least one block");
  block = reader.getCurrentValue();
  assertEquals(936, block.getTransactions().size(), "Compressed block must have exactly 936 transactions");
  assertEquals(4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,
      "Compressed block must contain exactly 936 transactions of which the first has one input and script length 4");
  assertEquals(2, block.getTransactions().get(0).getListOfOutputs().size(),
      "Compressed block must contain exactly 936 transactions of which the first has two outputs");
  assertEquals(25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,
      "Compressed block must contain exactly 936 transactions of which the first has two outputs and the first output script length 25");
  assertFalse(reader.nextKeyValue(), "No further blocks in compressed block");
  reader.close();
}
Example #11
Source File: BitcoinFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0
@Test
public void readBitcoinBlockInputFormatBzip2Compressed() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, job);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "version4comp.blk.bz2";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  BitcoinBlockFileInputFormat format = new BitcoinBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for compressed block");
  RecordReader<BytesWritable, BitcoinBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  BitcoinBlock block = new BitcoinBlock();
  assertTrue(reader.next(key, block), "Input Split for block version contains at least one block");
  assertEquals(936, block.getTransactions().size(), "Compressed block must have exactly 936 transactions");
  assertEquals(4, block.getTransactions().get(0).getListOfInputs().get(0).getTxInScript().length,
      "Compressed block must contain exactly 936 transactions of which the first has one input and script length 4");
  assertEquals(2, block.getTransactions().get(0).getListOfOutputs().size(),
      "Compressed block must contain exactly 936 transactions of which the first has two outputs");
  assertEquals(25, block.getTransactions().get(0).getListOfOutputs().get(0).getTxOutScript().length,
      "Compressed block must contain exactly 936 transactions of which the first has two outputs and the first output script length 25");
  BytesWritable emptyKey = new BytesWritable();
  BitcoinBlock emptyBlock = new BitcoinBlock();
  assertFalse(reader.next(emptyKey, emptyBlock), "No further blocks in compressed block");
  reader.close();
}
Example #12
Source File: TestLineRecordReader.java From big-c with Apache License 2.0
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  // read the data and check whether BOM is skipped
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);
  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
Example #13
Source File: TestLineRecordReader.java From big-c with Apache License 2.0
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
  LineRecordReader reader = new LineRecordReader(conf, split);
  LongWritable key = new LongWritable();
  Text value = new Text();
  //noinspection StatementWithEmptyBody
  while (reader.next(key, value)) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
Example #14
Source File: TestLineRecordReader.java From hadoop with Apache License 2.0
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  // read the data and check whether BOM is skipped
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);
  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
Example #15
Source File: TestLineRecordReader.java From hadoop with Apache License 2.0
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
  LineRecordReader reader = new LineRecordReader(conf, split);
  LongWritable key = new LongWritable();
  Text value = new Text();
  //noinspection StatementWithEmptyBody
  while (reader.next(key, value)) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
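All four variants close the reader twice and then verify that CodecPool still hands out ten distinct Decompressor instances; a double-close that returned the same decompressor to the pool twice would let the pool give one instance to two callers. For contrast, a minimal sketch of the intended borrow/return discipline:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;

public class PoolDiscipline {
  public static void main(String[] args) {
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(new Configuration());
    Decompressor d = CodecPool.getDecompressor(codec); // borrow from the pool
    try {
      // ... use d, e.g. codec.createInputStream(in, d) ...
    } finally {
      CodecPool.returnDecompressor(d); // return exactly once
    }
  }
}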
Example #16
Source File: HdfsSink2.java From sylph with Apache License 2.0
public HdfsSink2(Hdfs2SinkConfig config) throws ClassNotFoundException {
  this.batchSize = config.getBatchBufferSize();
  this.writerDir = config.getWriteDir();
  switch (config.getZipType().trim().toLowerCase()) {
    case "lzo":
      codecClass = (Class<? extends CompressionCodec>) Class.forName("com.hadoop.compression.lzo.LzopCodec");
      break;
    case "lz4":
      codecClass = Lz4Codec.class;
      break;
    case "snappy":
      codecClass = SnappyCodec.class;
      break;
    case "gzip":
      codecClass = GzipCodec.class;
      break;
    case "bzip2":
      codecClass = BZip2Codec.class;
      break;
    case "default":
      codecClass = DefaultCodec.class;
      break;
    default:
      codecClass = NoneCodec.class;
  }
}
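Once codecClass is selected, a writer would typically instantiate it through ReflectionUtils so the codec picks up the Hadoop configuration. The wiring below is a sketch of one plausible use, not taken from sylph; fs, conf, and writerDir are assumed to be in scope:

// Hypothetical downstream use of the selected codec class.
CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
OutputStream raw = fs.create(new Path(writerDir, "part-0000" + codec.getDefaultExtension()));
CompressionOutputStream out = codec.createOutputStream(raw);
// ... write batched records, then out.finish() and out.close() ...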
Example #17
Source File: CodecFactory.java From pxf with Apache License 2.0
/**
 * Determine whether a given compression codec is safe for multiple concurrent threads.
 *
 * @param compCodec     the user-given COMPRESSION_CODEC, may be null
 * @param dataSource    the file that we are accessing
 * @param configuration HDFS config
 * @return true only if it's thread safe
 */
public boolean isCodecThreadSafe(String compCodec, String dataSource, Configuration configuration) {
  Class<? extends CompressionCodec> codecClass = null;
  if (compCodec == null) {
    // check for file extensions indicating bzip2 (Text only)
    // currently doesn't check for bzip2 in .avro files
    codecClass = getCodecClassByPath(configuration, dataSource);
  }
  // make sure bzip2 is not the codec
  return !("bzip2".equalsIgnoreCase(compCodec)
      || BZip2Codec.class.getName().equalsIgnoreCase(compCodec)
      || (codecClass != null && BZip2Codec.class.isAssignableFrom(codecClass)));
}
Example #18
Source File: AbstractHadoopProcessor.java From localization_nifi with Apache License 2.0
@Override
public String toString() {
  switch (this) {
    case NONE:
      return "NONE";
    case DEFAULT:
      return DefaultCodec.class.getName();
    case BZIP:
      return BZip2Codec.class.getName();
    case GZIP:
      return GzipCodec.class.getName();
    case LZ4:
      return Lz4Codec.class.getName();
    case SNAPPY:
      return SnappyCodec.class.getName();
    case AUTOMATIC:
      return "Automatically Detected";
  }
  return null;
}
Example #19
Source File: S3SelectLineRecordReader.java From presto with Apache License 2.0
protected CompressionType getCompressionType(Path path) {
  CompressionCodec codec = compressionCodecFactory.getCodec(path);
  if (codec == null) {
    return CompressionType.NONE;
  }
  if (codec instanceof GzipCodec) {
    return CompressionType.GZIP;
  }
  if (codec instanceof BZip2Codec) {
    return CompressionType.BZIP2;
  }
  throw new PrestoException(NOT_SUPPORTED, "Compression extension not supported for S3 Select: " + path);
}
Example #20
Source File: TestCompression.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testBzip2SequenceFileCompression() throws Exception {
  runSequenceFileCompressionTest(new BZip2Codec(), 4);
}
Example #21
Source File: TestCompression.java From aliyun-maxcompute-data-collectors with Apache License 2.0
public void testBzip2TextCompression() throws IOException {
  runTextCompressionTest(new BZip2Codec(), 4);
}
Example #22
Source File: TestCreateHadoopSequenceFile.java From localization_nifi with Apache License 2.0
@Test
public void testSequenceFileBzipCompressionCodec() throws UnsupportedEncodingException, IOException {
  controller.setProperty(AbstractHadoopProcessor.COMPRESSION_CODEC, AbstractHadoopProcessor.CompressionType.BZIP.name());
  controller.setProperty(CreateHadoopSequenceFile.COMPRESSION_TYPE, SequenceFile.CompressionType.BLOCK.name());
  File inFile = inFiles[0];
  try (FileInputStream fin = new FileInputStream(inFile)) {
    controller.enqueue(fin);
  }
  controller.run();
  List<MockFlowFile> successSeqFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_SUCCESS);
  List<MockFlowFile> failedFlowFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_FAILURE);
  assertEquals(0, failedFlowFiles.size());
  assertEquals(1, successSeqFiles.size());
  MockFlowFile ff = successSeqFiles.iterator().next();
  byte[] data = ff.toByteArray();
  final String magicHeader = new String(data, 0, 3, "UTF-8");
  assertEquals("SEQ", magicHeader);
  // Format of header is SEQ followed by the version (1 byte).
  // Then, the length of the Key type (1 byte), then the Key type.
  // Then, the length of the Value type (1 byte), then the Value type.
  final String keyType = Text.class.getCanonicalName();
  final int valueTypeStart = 3 + 1 + 1 + keyType.length() + 1;
  final int valueTypeLength = data[5 + keyType.length()];
  final String valueType = BytesWritable.class.getCanonicalName();
  assertEquals(valueType.length(), valueTypeLength);
  assertEquals(valueType, new String(data, valueTypeStart, valueType.length(), "UTF-8"));
  final int compressionIndex = 3 + 1 + 1 + keyType.length() + 1 + valueType.length();
  final int blockCompressionIndex = compressionIndex + 1;
  assertEquals(1, data[compressionIndex]);
  assertEquals(1, data[blockCompressionIndex]);
  final int codecTypeSize = data[blockCompressionIndex + 1];
  final int codecTypeStartIndex = blockCompressionIndex + 2;
  assertEquals(BZip2Codec.class.getCanonicalName(), new String(data, codecTypeStartIndex, codecTypeSize, "UTF-8"));
}
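The byte offsets this test walks through come straight out of the SequenceFile header written by SequenceFile.Writer. For reference, a minimal sketch that produces such a block-compressed file with BZip2Codec (the path and record are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;

public class WriteBzip2SeqFile {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(new Path("/tmp/demo.seq")), // illustrative path
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(BytesWritable.class),
        SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, codec))) {
      writer.append(new Text("key"), new BytesWritable("value".getBytes("UTF-8")));
    }
  }
}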
Example #23
Source File: RcFileTester.java From presto with Apache License 2.0
@Override
Optional<String> getCodecName() {
  return Optional.of(BZip2Codec.class.getName());
}
Example #24
Source File: TestCreateHadoopSequenceFile.java From nifi with Apache License 2.0
@Test
public void testSequenceFileBzipCompressionCodec() throws UnsupportedEncodingException, IOException {
  controller.setProperty(AbstractHadoopProcessor.COMPRESSION_CODEC, CompressionType.BZIP.name());
  controller.setProperty(CreateHadoopSequenceFile.COMPRESSION_TYPE, SequenceFile.CompressionType.BLOCK.name());
  File inFile = inFiles[0];
  try (FileInputStream fin = new FileInputStream(inFile)) {
    controller.enqueue(fin);
  }
  controller.run();
  List<MockFlowFile> successSeqFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_SUCCESS);
  List<MockFlowFile> failedFlowFiles = controller.getFlowFilesForRelationship(CreateHadoopSequenceFile.RELATIONSHIP_FAILURE);
  assertEquals(0, failedFlowFiles.size());
  assertEquals(1, successSeqFiles.size());
  MockFlowFile ff = successSeqFiles.iterator().next();
  byte[] data = ff.toByteArray();
  final String magicHeader = new String(data, 0, 3, "UTF-8");
  assertEquals("SEQ", magicHeader);
  // Format of header is SEQ followed by the version (1 byte).
  // Then, the length of the Key type (1 byte), then the Key type.
  // Then, the length of the Value type (1 byte), then the Value type.
  final String keyType = Text.class.getCanonicalName();
  final int valueTypeStart = 3 + 1 + 1 + keyType.length() + 1;
  final int valueTypeLength = data[5 + keyType.length()];
  final String valueType = BytesWritable.class.getCanonicalName();
  assertEquals(valueType.length(), valueTypeLength);
  assertEquals(valueType, new String(data, valueTypeStart, valueType.length(), "UTF-8"));
  final int compressionIndex = 3 + 1 + 1 + keyType.length() + 1 + valueType.length();
  final int blockCompressionIndex = compressionIndex + 1;
  assertEquals(1, data[compressionIndex]);
  assertEquals(1, data[blockCompressionIndex]);
  final int codecTypeSize = data[blockCompressionIndex + 1];
  final int codecTypeStartIndex = blockCompressionIndex + 2;
  assertEquals(BZip2Codec.class.getCanonicalName(), new String(data, codecTypeStartIndex, codecTypeSize, "UTF-8"));
}