Java Code Examples for org.apache.hadoop.util.ReflectionUtils#setConf()
The following examples show how to use org.apache.hadoop.util.ReflectionUtils#setConf().
They are taken from open-source projects; the source file, originating project, and license are noted above each example.
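All of these examples share one pattern: an object is constructed directly (instead of via ReflectionUtils.newInstance()), and the live Configuration is then injected into it with setConf(). In the standard Hadoop implementation, setConf() calls Configurable.setConf() when the target implements Configurable, and reflectively invokes JobConfigurable.configure(JobConf) when it implements the old mapred interface. The snippet below is a minimal, self-contained sketch of that pattern, modeled on the codec setup in the tests that follow rather than taken from any one project:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class SetConfSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Constructed directly, so the codec starts without a Configuration.
    CompressionCodec gzip = new GzipCodec();
    // GzipCodec is Configurable, so this ends up calling gzip.setConf(conf);
    // the codec can then read settings (e.g. buffer sizes, compression
    // level) from the job's Configuration when it builds its streams.
    ReflectionUtils.setConf(gzip, conf);
  }
}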
Example 1
Source File: WritableComparator.java From big-c with Apache License 2.0
/** Get a comparator for a {@link WritableComparable} implementation. */
public static WritableComparator get(
    Class<? extends WritableComparable> c, Configuration conf) {
  WritableComparator comparator = comparators.get(c);
  if (comparator == null) {
    // force the static initializers to run
    forceInit(c);
    // look to see if it is defined now
    comparator = comparators.get(c);
    // if not, use the generic one
    if (comparator == null) {
      comparator = new WritableComparator(c, conf, true);
    }
  }
  // Newly passed Configuration objects should be used.
  ReflectionUtils.setConf(comparator, conf);
  return comparator;
}
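Note the final two lines: setConf() is applied even when a cached comparator was found, so that, per the comment in the source, a newly passed Configuration takes effect on a comparator that may have been created and registered earlier under a different one.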
Example 2
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws IOException {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  ReflectionUtils.setConf(gzip, job);
  format.configure(job);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<String> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 3
Source File: TestTextInputFormat.java From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 4
Source File: SequenceFile.java From RDFS with Apache License 2.0
/** Initialize. */
@SuppressWarnings("unchecked")
void init(Path name, Configuration conf, FSDataOutputStream out,
          Class keyClass, Class valClass,
          boolean compress, CompressionCodec codec, Metadata metadata)
    throws IOException {
  this.conf = conf;
  this.out = out;
  this.keyClass = keyClass;
  this.valClass = valClass;
  this.compress = compress;
  this.codec = codec;
  this.metadata = metadata;
  SerializationFactory serializationFactory = new SerializationFactory(conf);
  this.keySerializer = serializationFactory.getSerializer(keyClass);
  this.keySerializer.open(buffer);
  this.uncompressedValSerializer = serializationFactory.getSerializer(valClass);
  this.uncompressedValSerializer.open(buffer);
  if (this.codec != null) {
    ReflectionUtils.setConf(this.codec, this.conf);
    this.compressor = CodecPool.getCompressor(this.codec);
    this.deflateFilter = this.codec.createOutputStream(buffer, compressor);
    this.deflateOut =
        new DataOutputStream(new BufferedOutputStream(deflateFilter));
    this.compressedValSerializer = serializationFactory.getSerializer(valClass);
    this.compressedValSerializer.open(deflateOut);
  }
}
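Worth noting: the codec receives the Configuration before a compressor is borrowed from CodecPool and before createOutputStream() builds the deflate stream, so any configuration-driven codec settings are already in place when those objects are created.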
Example 5
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 6
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec with two input files.
 */
@Test (timeout=5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);
  // Create files with fixed length records with 5 byte long records.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }
  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
Example 7
Source File: TestCombineTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 1", 1, splits.size());
  List<Text> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
Example 8
Source File: HadoopInputFormatBase.java From Flink-CEPplus with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat,
                             Class<K> key, Class<V> value, JobConf job) {
  super(job.getCredentials());
  this.mapredInputFormat = mapredInputFormat;
  this.keyClass = key;
  this.valueClass = value;
  HadoopUtils.mergeHadoopConf(job);
  this.jobConf = job;
  ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
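Flink wraps a mapred InputFormat here, and setConf() does double duty: in the standard Hadoop implementation it calls Configurable.setConf() if the format implements Configurable, and reflectively invokes JobConfigurable.configure(JobConf) if it implements JobConfigurable (as classic formats such as TextInputFormat do), so the wrapper does not have to call configure() itself.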
Example 9
Source File: TestCombineTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  CombineTextInputFormat format = new CombineTextInputFormat();
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 1", 1, splits.length);
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 8, results.size());

  final String[] firstList =
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
  final String[] secondList = {"this is a test", "of gzip"};
  String first = results.get(0).toString();
  if (first.equals(firstList[0])) {
    testResults(results, firstList, secondList);
  } else if (first.equals(secondList[0])) {
    testResults(results, secondList, firstList);
  } else {
    fail("unexpected first token!");
  }
}
Example 10
Source File: TestKeyValueTextInputFormat.java From hadoop-gpu with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 11
Source File: TestTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test (timeout=5000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 12
Source File: TestKeyValueTextInputFormat.java From hadoop with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\nline-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 13
Source File: TestTextInputFormat.java From RDFS with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
public static void testGzip() throws IOException {
  JobConf job = new JobConf();
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat format = new TextInputFormat();
  format.configure(job);
  InputSplit[] splits = format.getSplits(job, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], job);
  assertEquals("splits[0] length", 6, results.size());
  assertEquals("splits[0][5]", " dog", results.get(5).toString());
  results = readSplit(format, splits[1], job);
  assertEquals("splits[1] length", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 14
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
private void runPartialRecordTest(CompressionCodec codec) throws IOException {
  localFs.delete(workDir, true);
  // Create a file with fixed length records with 5 byte long
  // records with a partial record at the end.
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  JobConf job = new JobConf(defaultConf);
  format.setRecordLength(job, 5);
  FileInputFormat.setInputPaths(job, workDir);
  if (codec != null) {
    ReflectionUtils.setConf(codec, job);
  }
  format.configure(job);
  writeFile(localFs, new Path(workDir, fileName.toString()), codec,
      "one  two  threefour five six  seveneightnine ten");
  InputSplit[] splits = format.getSplits(job, 100);
  if (codec != null) {
    assertEquals("compressed splits == 1", 1, splits.length);
  }
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      List<String> results = readSplit(format, split, job);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for partial record:", exceptionThrown);
}
Example 15
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
public HadoopInputFormatBase(org.apache.hadoop.mapred.InputFormat<K, V> mapredInputFormat,
                             Class<K> key, Class<V> value, JobConf job) {
  super(job.getCredentials());
  this.mapredInputFormat = mapredInputFormat;
  this.keyClass = key;
  this.valueClass = value;
  HadoopUtils.mergeHadoopConf(job);
  this.jobConf = job;
  ReflectionUtils.setConf(mapredInputFormat, jobConf);
}
Example 16
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();
  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
      totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }
    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader =
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:",
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}
Example 17
Source File: TestConcatenatedCompressedInput.java From big-c with Apache License 2.0
/**
 * Test using Hadoop's original, native-zlib gzip codec for reading.
 */
@Test
public void testGzip() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);

  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, jobConf);
  localFs.delete(workDir, true);

  // preferred, but not compatible with Apache/trunk instance of Hudson:
/*
  assertFalse("[native (C/C++) codec]",
    (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
     gzip.getDecompressorType()) );
  System.out.println(COLOR_BR_RED +
    "testGzip() using native-zlib Decompressor (" +
    gzip.getDecompressorType() + ")" + COLOR_NORMAL);
 */

  // alternative:
  if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
      gzip.getDecompressorType()) {
    System.out.println(COLOR_BR_RED +
      "testGzip() using native-zlib Decompressor (" +
      gzip.getDecompressorType() + ")" + COLOR_NORMAL);
  } else {
    LOG.warn("testGzip() skipped: native (C/C++) libs not loaded");
    return;
  }

/*
 *  // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
 *  // see https://issues.apache.org/jira/browse/HADOOP-6799
 *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
 *  //OutputStream out = localFs.create(fnHDFS);
 *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
 *  // can just combine those two lines, probably
 *  //GzipCodec.GzipOutputStream gzOStm =
 *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
 *  // oops, no: this is a protected helper class; need to access
 *  // it via createOutputStream() instead:
 *  OutputStream out = localFs.create(fnHDFS);
 *  Compressor gzCmp = gzip.createCompressor();
 *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
 *  // this SHOULD be going to HDFS: got out from localFs == HDFS
 *  // ...yup, works
 *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("2nd gzip concat member\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
 *  gzOStm.close();
 *  //
 *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
 *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
 *  localFs.copyToLocalFile(fnHDFS, fnLocal);
 */

  // copy prebuilt (correct!) version of concat.gz to HDFS
  final String fn = "concat" + gzip.getDefaultExtension();
  Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
  Path fnHDFS = new Path(workDir, fn);
  localFs.copyFromLocalFile(fnLocal, fnHDFS);

  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(jobConf, workDir);

  TextInputFormat format = new TextInputFormat();
  format.configure(jobConf);
  InputSplit[] splits = format.getSplits(jobConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }

  List<Text> results = readSplit(format, splits[0], jobConf);
  assertEquals("splits[0] num lines", 6, results.size());
  assertEquals("splits[0][5]", "member #3", results.get(5).toString());

  results = readSplit(format, splits[1], jobConf);
  assertEquals("splits[1] num lines", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Example 18
Source File: TestGroupedSplits.java From tez with Apache License 2.0
/**
 * Test using the gzip codec for reading
 */
@Test(timeout=10000)
public void testGzip() throws IOException {
  JobConf job = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, job);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "is\ngzip\n");
  writeFile(localFs, new Path(workDir, "part3.txt.gz"), gzip,
            "one\nmore\nsplit\n");
  FileInputFormat.setInputPaths(job, workDir);
  TextInputFormat wrappedFormat = new TextInputFormat();
  wrappedFormat.configure(job);
  TezGroupedSplitsInputFormat<LongWritable, Text> format =
      new TezGroupedSplitsInputFormat<LongWritable, Text>();
  format.setConf(job);
  format.setInputFormat(wrappedFormat);

  // TextInputFormat will produce 3 splits
  for (int j=1; j<=3; ++j) {
    format.setDesiredNumberOfSplits(j);
    InputSplit[] splits = format.getSplits(job, 100);
    if (j==1) {
      // j==1 covers single split corner case
      // and does not do grouping
      assertEquals("compressed splits == " + j, j, splits.length);
    }
    List<Text> results = new ArrayList<Text>();
    for (int i=0; i<splits.length; ++i) {
      List<Text> read = readSplit(format, splits[i], job);
      results.addAll(read);
    }
    assertEquals("splits length", 11, results.size());

    final String[] firstList =
      {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"};
    final String[] secondList = {"is", "gzip"};
    final String[] thirdList = {"one", "more", "split"};
    String first = results.get(0).toString();
    int start = 0;
    switch (first.charAt(0)) {
    case 't':
      start = testResults(results, firstList, start);
      break;
    case 'i':
      start = testResults(results, secondList, start);
      break;
    case 'o':
      start = testResults(results, thirdList, start);
      break;
    default:
      Assert.fail("unexpected first token - " + first);
    }
  }
}
Example 19
Source File: TestConcatenatedCompressedInput.java From hadoop with Apache License 2.0
/**
 * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
 */
@Test
public void testMoreBzip2() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);

  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, jobConf);
  localFs.delete(workDir, true);

  System.out.println(COLOR_BR_MAGENTA +
    "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
    COLOR_NORMAL);

  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data","/tmp"),fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

  // copy multiple-member test file to HDFS
  String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data","/tmp"),fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

  FileInputFormat.setInputPaths(jobConf, workDir);

  // here's first pair of BlockDecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2567, in1.available());
  assertEquals("concat bytes available", 3056, in2.available());

/*
  // FIXME
  // The while-loop below dies at the beginning of the 2nd concatenated
  // member (after 17 lines successfully read) with:
  //
  //   java.io.IOException: bad block header
  //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
  //   CBZip2InputStream.java:527)
  //
  // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
  // simply commented out for now (and HADOOP-6852 filed).  If and when the
  // latter issue is resolved--perhaps by fixing an error here--this code
  // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
  // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

  CompressionInputStream cin2 = bzip2.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();

  int numBytes, totalBytes=0, lineNum=0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();

  assertEquals("total uncompressed bytes in concatenated test file",
               5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file",
               84, lineNum);
 */

  // test CBZip2InputStream with lots of different input-buffer sizes
  doMultipleBzip2BufferSizes(jobConf, false);

  // no native version of bzip2 codec (yet?)
  //doMultipleBzip2BufferSizes(jobConf, true);
}
Example 20
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
private void runRandomTests(CompressionCodec codec) throws IOException {
  StringBuilder fileName = new StringBuilder("testFormat.txt");
  if (codec != null) {
    fileName.append(".gz");
  }
  localFs.delete(workDir, true);
  Path file = new Path(workDir, fileName.toString());
  int seed = new Random().nextInt();
  LOG.info("Seed = " + seed);
  Random random = new Random(seed);
  int MAX_TESTS = 20;
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();
  for (int i = 0; i < MAX_TESTS; i++) {
    LOG.info("----------------------------------------------------------");
    // Maximum total records of 999
    int totalRecords = random.nextInt(999)+1;
    // Test an empty file
    if (i == 8) {
      totalRecords = 0;
    }
    // Maximum bytes in a record of 100K
    int recordLength = random.nextInt(1024*100)+1;
    // For the 11th test, force a record length of 1
    if (i == 10) {
      recordLength = 1;
    }
    // The total bytes in the test file
    int fileSize = (totalRecords * recordLength);
    LOG.info("totalRecords=" + totalRecords + " recordLength="
        + recordLength);
    // Create the job
    JobConf job = new JobConf(defaultConf);
    if (codec != null) {
      ReflectionUtils.setConf(codec, job);
    }
    // Create the test file
    ArrayList<String> recordList
        = createFile(file, codec, recordLength, totalRecords);
    assertTrue(localFs.exists(file));
    //set the fixed length record length config property for the job
    FixedLengthInputFormat.setRecordLength(job, recordLength);

    int numSplits = 1;
    // Arbitrarily set number of splits.
    if (i > 0) {
      if (i == (MAX_TESTS-1)) {
        // Test a split size that is less than record len
        numSplits = (int)(fileSize/Math.floor(recordLength/2));
      } else {
        if (MAX_TESTS % i == 0) {
          // Let us create a split size that is forced to be
          // smaller than the end file itself, (ensures 1+ splits)
          numSplits = fileSize/(fileSize - random.nextInt(fileSize));
        } else {
          // Just pick a random split size with no upper bound
          numSplits = Math.max(1, fileSize/random.nextInt(Integer.MAX_VALUE));
        }
      }
      LOG.info("Number of splits set to: " + numSplits);
    }
    // Setup the input path
    FileInputFormat.setInputPaths(job, workDir);
    // Try splitting the file in a variety of sizes
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit splits[] = format.getSplits(job, numSplits);
    LOG.info("Actual number of splits = " + splits.length);
    // Test combined split lengths = total file size
    long recordOffset = 0;
    int recordNumber = 0;
    for (InputSplit split : splits) {
      RecordReader<LongWritable, BytesWritable> reader =
          format.getRecordReader(split, job, voidReporter);
      Class<?> clazz = reader.getClass();
      assertEquals("RecordReader class should be FixedLengthRecordReader:",
          FixedLengthRecordReader.class, clazz);
      // Plow through the records in this split
      while (reader.next(key, value)) {
        assertEquals("Checking key", (long)(recordNumber*recordLength),
            key.get());
        String valueString =
            new String(value.getBytes(), 0, value.getLength());
        assertEquals("Checking record length:", recordLength,
            value.getLength());
        assertTrue("Checking for more records than expected:",
            recordNumber < totalRecords);
        String origRecord = recordList.get(recordNumber);
        assertEquals("Checking record content:", origRecord, valueString);
        recordNumber++;
      }
      reader.close();
    }
    assertEquals("Total original records should be total read records:",
        recordList.size(), recordNumber);
  }
}