Java Code Examples for org.apache.hadoop.util.ReflectionUtils#setConf()
The following examples show how to use org.apache.hadoop.util.ReflectionUtils#setConf().
They are taken from open-source projects; the source file, originating project, and license are noted above each example.
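All of these examples share one pattern: an object is constructed directly (instead of via ReflectionUtils.newInstance()), and the live Configuration is then injected into it with setConf(). In the standard Hadoop implementation, setConf() calls Configurable.setConf() when the target implements Configurable, and reflectively invokes JobConfigurable.configure(JobConf) when it implements the old mapred interface. The snippet below is a minimal, self-contained sketch of that pattern, modeled on the codec setup in the tests that follow rather than taken from any one project:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class SetConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Constructed directly, so the codec starts without a Configuration.
    CompressionCodec gzip = new GzipCodec();
    // GzipCodec is Configurable, so this ends up calling gzip.setConf(conf);
    // the codec can then read settings (e.g. buffer sizes, compression
    // level) from the job's Configuration when it builds its streams.
    ReflectionUtils.setConf(gzip, conf);
  }
}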
Example 1
Source File: WritableComparator.java From big-c with Apache License 2.0
/** Get a comparator for a {@link WritableComparable} implementation. */
public static WritableComparator get(
    Class<? extends WritableComparable> c, Configuration conf) {
  WritableComparator comparator = comparators.get(c);
  if (comparator == null) {
    // force the static initializers to run
    forceInit(c);
    // look to see if it is defined now
    comparator = comparators.get(c);
    // if not, use the generic one
    if (comparator == null) {
      comparator = new WritableComparator(c, conf, true);
    }
  }
  // Newly passed Configuration objects should be used.
  ReflectionUtils.setConf(comparator, conf);
  return comparator;
}
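Note the final two lines: setConf() is applied even when a cached comparator was found, so that, per the comment in the source, a newly passed Configuration takes effect on a comparator that may have been created and registered earlier under a different one.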
Example 2
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws IOException {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  ReflectionUtils.setConf(gzip, job);
  format.configure(job);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<String> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 3
Source File: TestTextInputFormat.java From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 4
Source File: SequenceFile.java From RDFS with Apache License 2.0
/** Initialize. */
@SuppressWarnings("unchecked")
void init(Path name, Configuration conf, FSDataOutputStream out,
          Class keyClass, Class valClass,
          boolean compress, CompressionCodec codec, Metadata metadata)
    throws IOException {
  this.conf = conf;
  this.out = out;
  this.keyClass = keyClass;
  this.valClass = valClass;
  this.compress = compress;
  this.codec = codec;
  this.metadata = metadata;
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  this.keySerializer = serializationFactory.getSerializer(keyClass);
  this.keySerializer.open(buffer);
  this.uncompressedValSerializer = serializationFactory.getSerializer(valClass);
  this.uncompressedValSerializer.open(buffer);
  if (this.codec != null) {
    ReflectionUtils.setConf(this.codec, this.conf);
    this.compressor = CodecPool.getCompressor(this.codec);
    this.deflateFilter = this.codec.createOutputStream(buffer, compressor);
    this.deflateOut =
        new DataOutputStream(new BufferedOutputStream(deflateFilter));
    this.compressedValSerializer = serializationFactory.getSerializer(valClass);
    this.compressedValSerializer.open(deflateOut);
  }
}
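Worth noting: the codec receives the Configuration before a compressor is borrowed from CodecPool and before createOutputStream() builds the deflate stream, so any configuration-driven codec settings are already in place when those objects are created.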
Example 5
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 6
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 7
Source File: TestCombineTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
Example 8
Source File: HadoopInputFormatBase.java From Flink-CEPplus with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat,
                             Class<K> key, Class<V> value, JobConf job) {
  super(job.getCredentials());
  this.mapredInputFormat = mapredInputFormat;
  this.keyClass = key;
  this.valueClass = value;
  HadoopUtils.mergeHadoopConf(job);
  this.jobConf = job;
  ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
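Flink wraps a mapred InputFormat here, and setConf() does double duty: in the standard Hadoop implementation it calls Configurable.setConf() if the format implements Configurable, and reflectively invokes JobConfigurable.configure(JobConf) if it implements JobConfigurable (as classic formats such as TextInputFormat do), so the wrapper does not have to call configure() itself.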
Example 9
Source File: TestCombineTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 1", 1, splits.length);
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
Example 10
Source File: TestKeyValueTextInputFormat.java From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 11
Source File: TestTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test (timeout=5000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 12
Source File: TestKeyValueTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 13
Source File: TestTextInputFormat.java From RDFS with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 14
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
private void runPartialRecordTest(CompressionCodec codec) throws IOException {
  localFs.delete(workDir, true);
  // Create a file with fixed length records with 5 byte long
  // records with a partial record at the end.
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  if (codec != null) {
    ReflectionUtils.setConf(codec, job);
  }
  format.configure(job);
  writeFile(localFs, new Path(workDir, fileName.toString()), codec,
      "one  two  threefour five six  seveneightnine ten");
  InputSplit[] splits = format.getSplits(job, 100);
  if (codec != null) {
    assertEquals("compressed splits == 1", 1, splits.length);
  }
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      List<String> results = readSplit(format, split, job);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for partial record:", exceptionThrown);
}
Example 15
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat,
                             Class<K> key, Class<V> value, JobConf job) {
  super(job.getCredentials());
  this.mapredInputFormat = mapredInputFormat;
  this.keyClass = key;
  this.valueClass = value;
  HadoopUtils.mergeHadoopConf(job);
  this.jobConf = job;
  ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
Example 16
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();
  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
      totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }
    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader =
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:",
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}
Example 17
Source File: TestConcatenatedCompressedInput.java From big-c with Apache License 2.0
/**
 * Test using Hadoop's original, native-zlib gzip codec for reading.
 */
@Test
public void testGzip() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);

  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, jobConf);
  localFs.delete(workDir, true);

  // preferred, but not compatible with Apache/trunk instance of Hudson:
/*
  assertFalse("[native (C/C++) codec]",
    (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
     gzip.getDecompressorType()) );
  System.out.println(COLOR_BR_RED +
    "testGzip() using native-zlib Decompressor (" +
    gzip.getDecompressorType() + ")" + COLOR_NORMAL);
 */

  // alternative:
  if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
      gzip.getDecompressorType()) {
    System.out.println(COLOR_BR_RED +
      "testGzip() using native-zlib Decompressor (" +
      gzip.getDecompressorType() + ")" + COLOR_NORMAL);
  } else {
    LOG.warn("testGzip() skipped: native (C/C++) libs not loaded");
    return;
  }

/*
 *  // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
 *  // see https://issues.apache.org/jira/browse/HADOOP-6799
 *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
 *  //OutputStream out = localFs.create(fnHDFS);
 *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
 *  // can just combine those two lines, probably
 *  //GzipCodec.GzipOutputStream gzOStm =
 *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
 *  // oops, no: this is a protected helper class; need to access
 *  // it via createOutputStream() instead:
 *  OutputStream out = localFs.create(fnHDFS);
 *  Compressor gzCmp = gzip.createCompressor();
 *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
 *  // this SHOULD be going to HDFS: got out from localFs == HDFS
 *  // ...yup, works
 *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("2nd gzip concat member\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
 *  gzOStm.close();
 *  //
 *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
 *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
 *  localFs.copyToLocalFile(fnHDFS, fnLocal);
 */

  // copy prebuilt (correct!) version of concat.gz to HDFS
  final String fn = "concat" + gzip.getDefaultExtension();
  Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
  Path fnHDFS = new Path(workDir, fn);
  localFs.copyFromLocalFile(fnLocal, fnHDFS);

  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(jobConf, workDir);

  TextInputFormat format = new TextInputFormat();
  format.configure(jobConf);
  InputSplit[] splits = format.getSplits(jobConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }

  List<Text> results = readSplit(format, splits[0], jobConf);
  assertEquals("splits[0] num lines", 6, results.size());
  assertEquals("splits[0][5]", "member #3", results.get(5).toString());

  results = readSplit(format, splits[1], jobConf);
  assertEquals("splits[1] num lines", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 18
Source File: TestGroupedSplits.java From tez with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "is\ngzip\n");
  writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip,
            "one\nmore\nsplit\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat wrappedFormat = new TextInputFormat();
  wrappedFormat.configure(job);
  TezGroupedSplitsInputFormat<LongWritable, Text> format =
      new TezGroupedSplitsInputFormat<LongWritable, Text>();
  format.setConf(job);
  format.setInputFormat(wrappedFormat);

  // TextInputFormat will produce 3 splits
  for (int j=1; j<=3; ++j) {
    format.setDesiredNumberOfSplits(j);
    InputSplit[] splits = format.getSplits(job, 100);
    if (j==1) {
      // j==1 covers single split corner case
      // and does not do grouping
      assertEquals("compressed splits == " + j, j, splits.length);
    }
    List<Text> results = new ArrayList<Text>();
    for (int i=0; i<splits.length; ++i) {
      List<Text> read = readSplit(format, splits[i], job);
      results.addAll(read);
    }
    assertEquals("splits length", 11, results.size());

    final String[] firstList =
      {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
    final String[] secondList = {"is", "gzip"};
    final String[] thirdList = {"one", "more", "split"};
    String first = results.get(0).toString();
    int start = 0;
    switch (first.charAt(0)) {
    case 't':
      start = testResults(results, firstList, start);
      break;
    case 'i':
      start = testResults(results, secondList, start);
      break;
    case 'o':
      start = testResults(results, thirdList, start);
      break;
    default:
      Assert.fail("unexpected first token - " + first);
    }
  }
}
Example 19
Source File: TestConcatenatedCompressedInput.java From hadoop with Apache License 2.0
/**
 * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
 */
@Test
public void testMoreBzip2() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);

  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, jobConf);
  localFs.delete(workDir, true);

  System.out.println(COLOR_BR_MAGENTA +
    "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
    COLOR_NORMAL);

  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data","/tmp"),fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

  // copy multiple-member test file to HDFS
  String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data","/tmp"),fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

  FileInputFormat.setInputPaths(jobConf, workDir);

  // here's first pair of BlockDecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2567, in1.available());
  assertEquals("concat bytes available", 3056, in2.available());

/*
  // FIXME
  // The while-loop below dies at the beginning of the 2nd concatenated
  // member (after 17 lines successfully read) with:
  //
  //   java.io.IOException: bad block header
  //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
  //   CBZip2InputStream.java:527)
  //
  // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
  // simply commented out for now (and HADOOP-6852 filed).  If and when the
  // latter issue is resolved--perhaps by fixing an error here--this code
  // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
  // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

  CompressionInputStream cin2 = bzip2.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();

  int numBytes, totalBytes=0, lineNum=0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();

  assertEquals("total uncompressed bytes in concatenated test file",
               5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file",
               84, lineNum);
 */

  // test CBZip2InputStream with lots of different input-buffer sizes
  doMultipleBzip2BufferSizes(jobConf, false);

  // no native version of bzip2 codec (yet?)
  //doMultipleBzip2BufferSizes(jobConf, true);
}
Example 20
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();
  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
      totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }
    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader =
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:",
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}