org.apache.hadoop.examples.WordCount.TokenizerMapper Java Examples
The following examples show how to use
org.apache.hadoop.examples.WordCount.TokenizerMapper.
The examples are taken from the open-source projects named above each listing.
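For context, the TokenizerMapper that each example wires in as the map class is the mapper from Hadoop's bundled WordCount example. It is conventionally defined roughly as follows; this is a sketch based on the standard Hadoop sources rather than code taken from the projects below, and minor details may differ between versions.

public static class TokenizerMapper
    extends Mapper<Object, Text, Text, IntWritable> {

  private final static IntWritable one = new IntWritable(1);
  private Text word = new Text();

  // Split each input line into whitespace-delimited tokens and emit (token, 1).
  public void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    StringTokenizer itr = new StringTokenizer(value.toString());
    while (itr.hasMoreTokens()) {
      word.set(itr.nextToken());
      context.write(word, one);
    }
  }
}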
Example #1
Source File: MutiWordcount.java From big-c with Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

  long maxInputSplitSize = Long.parseLong(otherArgs[otherArgs.length - 1]);
  long minInputSplitSize = Long.parseLong(otherArgs[otherArgs.length - 2]);

  //added by wei
  conf.setLong(CombineTextInputFormat.SPLIT_MINSIZE_PERNODE, minInputSplitSize);
  conf.setLong(CombineTextInputFormat.SPLIT_MINSIZE_PERRACK, minInputSplitSize);
  //----

  if (otherArgs.length < 2) {
    System.err.println("Usage: wordcount <in> [<in>...] <out>");
    System.exit(2);
  }

  Job job = new Job(conf, "mulword count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  //added by wei
  job.setInputFormatClass(CombineTextInputFormat.class);
  CombineTextInputFormat.setMaxInputSplitSize(job, maxInputSplitSize);
  //----

  for (int i = 0; i < otherArgs.length - 3; ++i) {
    FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
  }
  FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 3]));

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
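Each example also registers WordCount's IntSumReducer as both the combiner and the reducer. For reference, that class is conventionally defined along these lines; again, this is a sketch based on the standard Hadoop WordCount example, not code copied from the fork above.

public static class IntSumReducer
    extends Reducer<Text, IntWritable, Text, IntWritable> {

  private IntWritable result = new IntWritable();

  // Sum the counts emitted for each word and write (word, total).
  public void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable val : values) {
      sum += val.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}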
Example #2
Source File: TestMapReduceLocal.java From RDFS with Apache License 2.0
private void runWordCount(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  final String COUNTER_GROUP = "org.apache.hadoop.mapred.Task$Counter";
  localFs.delete(new Path(TEST_ROOT_DIR + "/in"), true);
  localFs.delete(new Path(TEST_ROOT_DIR + "/out"), true);
  writeFile("in/part1", "this is a test\nof word count test\ntest\n");
  writeFile("in/part2", "more test");
  Job job = new Job(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setInputFormatClass(TrackingTextInputFormat.class);
  FileInputFormat.addInputPath(job, new Path(TEST_ROOT_DIR + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(TEST_ROOT_DIR + "/out"));
  assertTrue(job.waitForCompletion(false));
  String out = readFile("out/part-r-00000");
  System.out.println(out);
  assertEquals("a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n", out);
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);
  long combineIn = ctrs.findCounter(COUNTER_GROUP, "COMBINE_INPUT_RECORDS").getValue();
  long combineOut = ctrs.findCounter(COUNTER_GROUP, "COMBINE_OUTPUT_RECORDS").getValue();
  long reduceIn = ctrs.findCounter(COUNTER_GROUP, "REDUCE_INPUT_RECORDS").getValue();
  long mapOut = ctrs.findCounter(COUNTER_GROUP, "MAP_OUTPUT_RECORDS").getValue();
  long reduceOut = ctrs.findCounter(COUNTER_GROUP, "REDUCE_OUTPUT_RECORDS").getValue();
  long reduceGrps = ctrs.findCounter(COUNTER_GROUP, "REDUCE_INPUT_GROUPS").getValue();
  assertEquals("map out = combine in", mapOut, combineIn);
  assertEquals("combine out = reduce in", combineOut, reduceIn);
  assertTrue("combine in > combine out", combineIn > combineOut);
  assertEquals("reduce groups = reduce out", reduceGrps, reduceOut);
  String group = "Random Group";
  CounterGroup ctrGrp = ctrs.getGroup(group);
  assertEquals(0, ctrGrp.size());
}
Example #3
Source File: TestMapReduceLocal.java From hadoop-gpu with Apache License 2.0
private void runWordCount(Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  final String COUNTER_GROUP = "org.apache.hadoop.mapred.Task$Counter";
  localFs.delete(new Path(TEST_ROOT_DIR + "/in"), true);
  localFs.delete(new Path(TEST_ROOT_DIR + "/out"), true);
  writeFile("in/part1", "this is a test\nof word count test\ntest\n");
  writeFile("in/part2", "more test");
  Job job = new Job(conf, "word count");
  job.setJarByClass(WordCount.class);
  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setInputFormatClass(TrackingTextInputFormat.class);
  FileInputFormat.addInputPath(job, new Path(TEST_ROOT_DIR + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(TEST_ROOT_DIR + "/out"));
  assertTrue(job.waitForCompletion(false));
  String out = readFile("out/part-r-00000");
  System.out.println(out);
  assertEquals("a\t1\ncount\t1\nis\t1\nmore\t1\nof\t1\ntest\t4\nthis\t1\nword\t1\n", out);
  Counters ctrs = job.getCounters();
  System.out.println("Counters: " + ctrs);
  long combineIn = ctrs.findCounter(COUNTER_GROUP, "COMBINE_INPUT_RECORDS").getValue();
  long combineOut = ctrs.findCounter(COUNTER_GROUP, "COMBINE_OUTPUT_RECORDS").getValue();
  long reduceIn = ctrs.findCounter(COUNTER_GROUP, "REDUCE_INPUT_RECORDS").getValue();
  long mapOut = ctrs.findCounter(COUNTER_GROUP, "MAP_OUTPUT_RECORDS").getValue();
  assertEquals("map out = combine in", mapOut, combineIn);
  assertEquals("combine out = reduce in", combineOut, reduceIn);
  assertTrue("combine in > combine out", combineIn > combineOut);
  String group = "Random Group";
  CounterGroup ctrGrp = ctrs.getGroup(group);
  assertEquals(0, ctrGrp.size());
}