org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer Java Examples
The following examples show how to use
org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer.
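LongSumReducer<KEY> is a generic reducer that sums every LongWritable value associated with a key and emits a single (key, total) pair. Because summation is associative and commutative, the same class is frequently registered as both the combiner and the reducer, as several examples below do. Conceptually it behaves like the following sketch (a minimal illustrative re-implementation, not the Hadoop source verbatim):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

// Minimal sketch of what LongSumReducer does: sum the LongWritable values
// seen for each key and emit one (key, sum) record.
public class LongSumSketch<KEY> extends Reducer<KEY, LongWritable, KEY, LongWritable> {
  private final LongWritable result = new LongWritable();

  @Override
  protected void reduce(KEY key, Iterable<LongWritable> values, Context context)
      throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable value : values) {
      sum += value.get();
    }
    result.set(sum);
    context.write(key, result);
  }
}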
Example #1
Source File: PerformanceEvaluation.java From hbase with Apache License 2.0
/**
 * Run a mapreduce job.  Run as many maps as asked-for clients.
 * Before we start up the job, write out an input file with instruction
 * per client regards which row they are to start on.
 * @param cmd Command to run.
 */
private void doMapReduce(final Class<? extends Test> cmd)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration conf = getConf();
  Path inputDir = writeInputFile(conf);
  conf.set(EvaluationMapTask.CMD_KEY, cmd.getName());
  conf.set(EvaluationMapTask.PE_KEY, getClass().getName());
  Job job = Job.getInstance(conf);
  job.setJarByClass(PerformanceEvaluation.class);
  job.setJobName("HBase Performance Evaluation");

  job.setInputFormatClass(PeInputFormat.class);
  PeInputFormat.setInputPaths(job, inputDir);

  job.setOutputKeyClass(LongWritable.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(EvaluationMapTask.class);
  job.setReducerClass(LongSumReducer.class);
  job.setNumReduceTasks(1);

  job.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(job, new Path(inputDir.getParent(), "outputs"));
  TableMapReduceUtil.addDependencyJars(job);
  TableMapReduceUtil.initCredentials(job);
  job.waitForCompletion(true);
}
Example #2
Source File: WATServerType.java From cc-warc-examples with MIT License
/**
 * Builds and runs the Hadoop job.
 * @return 0 if the Hadoop job completes successfully and 1 otherwise.
 */
@Override
public int run(String[] arg0) throws Exception {
  Configuration conf = getConf();
  //
  Job job = new Job(conf);
  job.setJarByClass(WATServerType.class);
  job.setNumReduceTasks(1);

  String inputPath = "data/*.warc.wat.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.wet.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/*.warc.wet.gz";
  LOG.info("Input path: " + inputPath);
  FileInputFormat.addInputPath(job, new Path(inputPath));

  String outputPath = "/tmp/cc/";
  FileSystem fs = FileSystem.newInstance(conf);
  if (fs.exists(new Path(outputPath))) {
    fs.delete(new Path(outputPath), true);
  }
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(WARCFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(ServerTypeMap.ServerMapper.class);
  job.setReducerClass(LongSumReducer.class);

  if (job.waitForCompletion(true)) {
    return 0;
  } else {
    return 1;
  }
}
Example #3
Source File: WETWordCount.java From cc-warc-examples with MIT License
/**
 * Builds and runs the Hadoop job.
 * @return 0 if the Hadoop job completes successfully and 1 otherwise.
 */
@Override
public int run(String[] arg0) throws Exception {
  Configuration conf = getConf();
  //
  Job job = new Job(conf);
  job.setJarByClass(WETWordCount.class);
  job.setNumReduceTasks(1);

  String inputPath = "data/*.warc.wet.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.wet.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/*.warc.wet.gz";
  LOG.info("Input path: " + inputPath);
  FileInputFormat.addInputPath(job, new Path(inputPath));

  String outputPath = "/tmp/cc/";
  FileSystem fs = FileSystem.newInstance(conf);
  if (fs.exists(new Path(outputPath))) {
    fs.delete(new Path(outputPath), true);
  }
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(WARCFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(WordCounterMap.WordCountMapper.class);
  // The reducer is quite useful in the word frequency task
  job.setReducerClass(LongSumReducer.class);

  if (job.waitForCompletion(true)) {
    return 0;
  } else {
    return 1;
  }
}
Example #4
Source File: WARCTagCounter.java From cc-warc-examples with MIT License
/**
 * Builds and runs the Hadoop job.
 * @return 0 if the Hadoop job completes successfully and 1 otherwise.
 */
@Override
public int run(String[] arg0) throws Exception {
  Configuration conf = getConf();
  //
  Job job = new Job(conf);
  job.setJarByClass(WARCTagCounter.class);
  job.setNumReduceTasks(1);

  String inputPath = "data/*.warc.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.wet.gz";
  //inputPath = "s3n://aws-publicdatasets/common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/wet/*.warc.wet.gz";
  LOG.info("Input path: " + inputPath);
  FileInputFormat.addInputPath(job, new Path(inputPath));

  String outputPath = "/tmp/cc/";
  FileSystem fs = FileSystem.newInstance(conf);
  if (fs.exists(new Path(outputPath))) {
    fs.delete(new Path(outputPath), true);
  }
  FileOutputFormat.setOutputPath(job, new Path(outputPath));

  job.setInputFormatClass(WARCFileInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  job.setMapperClass(TagCounterMap.TagCounterMapper.class);
  job.setReducerClass(LongSumReducer.class);

  return job.waitForCompletion(true) ? 0 : -1;
}
Example #5
Source File: Wordcount.java From logparser with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  Configuration conf = new Configuration();

  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
  if (otherArgs.length != 2) {
    System.err.println("Usage: wordcount <in> <out>");
    return 2;
  }

  conf.set("nl.basjes.parse.apachehttpdlogline.format", logFormat);

  // A ',' separated list of fields
  conf.set("nl.basjes.parse.apachehttpdlogline.fields", "STRING:request.status.last");

  Job job = Job.getInstance(conf, "word count");
  job.setJarByClass(Wordcount.class);

  FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
  job.setInputFormatClass(ApacheHttpdLogfileInputFormat.class);

  job.setMapperClass(TokenizerMapper.class);
  job.setCombinerClass(LongSumReducer.class);
  job.setReducerClass(LongSumReducer.class);

  // configuration should contain reference to your namenode
  FileSystem fs = FileSystem.get(conf);
  // true stands for recursively deleting the folder you gave
  Path outputPath = new Path(otherArgs[1]);
  fs.delete(outputPath, true);
  FileOutputFormat.setOutputPath(job, outputPath);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(LongWritable.class);

  if (job.waitForCompletion(true)) {
    return 0;
  }
  return 1;
}
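In this job LongSumReducer serves as both the combiner and the reducer, so the mapper must emit Text keys with LongWritable counts. The logparser project's actual TokenizerMapper is not shown on this page; the sketch below is a hypothetical stand-in that only illustrates the required map-output contract:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper (not the project's TokenizerMapper): emits (token, 1L)
// so that LongSumReducer can total the occurrences per token.
public class TokenCountMapper extends Mapper<Object, Text, Text, LongWritable> {
  private static final LongWritable ONE = new LongWritable(1);
  private final Text word = new Text();

  @Override
  protected void map(Object key, Text value, Context context)
      throws IOException, InterruptedException {
    for (String token : value.toString().split("\\s+")) {
      if (!token.isEmpty()) {
        word.set(token);
        context.write(word, ONE); // value type matches LongSumReducer's input
      }
    }
  }
}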
Example #6
Source File: DBCountPageView.java From hadoop with Apache License 2.0
@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {
  String driverClassName = DRIVER_CLASS;
  String url = DB_URL;

  if (args.length > 1) {
    driverClassName = args[0];
    url = args[1];
  }

  initialize(driverClassName, url);
  Configuration conf = getConf();

  DBConfiguration.configureDB(conf, driverClassName, url);

  Job job = new Job(conf);

  job.setJobName("Count Pageviews of URLs");
  job.setJarByClass(DBCountPageView.class);
  job.setMapperClass(PageviewMapper.class);
  job.setCombinerClass(LongSumReducer.class);
  job.setReducerClass(PageviewReducer.class);

  DBInputFormat.setInput(job, AccessRecord.class, "Access", null, "url", AccessFieldNames);
  DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);

  job.setOutputKeyClass(PageviewRecord.class);
  job.setOutputValueClass(NullWritable.class);

  int ret;
  try {
    ret = job.waitForCompletion(true) ? 0 : 1;
    boolean correct = verify();
    if (!correct) {
      throw new RuntimeException("Evaluation was not correct!");
    }
  } finally {
    shutdown();
  }
  return ret;
}
Example #7
Source File: Grep.java From hadoop with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 3) {
    System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }

  Path tempDir =
      new Path("grep-temp-" +
          Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  Configuration conf = getConf();
  conf.set(RegexMapper.PATTERN, args[2]);
  if (args.length == 4)
    conf.set(RegexMapper.GROUP, args[3]);

  Job grepJob = Job.getInstance(conf);

  try {
    grepJob.setJobName("grep-search");
    grepJob.setJarByClass(Grep.class);

    FileInputFormat.setInputPaths(grepJob, args[0]);

    grepJob.setMapperClass(RegexMapper.class);

    grepJob.setCombinerClass(LongSumReducer.class);
    grepJob.setReducerClass(LongSumReducer.class);

    FileOutputFormat.setOutputPath(grepJob, tempDir);
    grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    grepJob.setOutputKeyClass(Text.class);
    grepJob.setOutputValueClass(LongWritable.class);

    grepJob.waitForCompletion(true);

    Job sortJob = Job.getInstance(conf);
    sortJob.setJobName("grep-sort");
    sortJob.setJarByClass(Grep.class);

    FileInputFormat.setInputPaths(sortJob, tempDir);
    sortJob.setInputFormatClass(SequenceFileInputFormat.class);

    sortJob.setMapperClass(InverseMapper.class);

    sortJob.setNumReduceTasks(1);               // write a single file
    FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
    sortJob.setSortComparatorClass(             // sort by decreasing freq
        LongWritable.DecreasingComparator.class);

    sortJob.waitForCompletion(true);
  } finally {
    FileSystem.get(conf).delete(tempDir, true);
  }
  return 0;
}
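Note that the grep-search job registers LongSumReducer as both the combiner and the reducer. That is safe because summing match counts is associative and commutative, so partial sums produced map-side can be summed again reduce-side without changing the final totals.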
Example #8
Source File: DBCountPageView.java From big-c with Apache License 2.0
@Override
//Usage DBCountPageView [driverClass dburl]
public int run(String[] args) throws Exception {
  String driverClassName = DRIVER_CLASS;
  String url = DB_URL;

  if (args.length > 1) {
    driverClassName = args[0];
    url = args[1];
  }

  initialize(driverClassName, url);
  Configuration conf = getConf();

  DBConfiguration.configureDB(conf, driverClassName, url);

  Job job = new Job(conf);

  job.setJobName("Count Pageviews of URLs");
  job.setJarByClass(DBCountPageView.class);
  job.setMapperClass(PageviewMapper.class);
  job.setCombinerClass(LongSumReducer.class);
  job.setReducerClass(PageviewReducer.class);

  DBInputFormat.setInput(job, AccessRecord.class, "Access", null, "url", AccessFieldNames);
  DBOutputFormat.setOutput(job, "Pageview", PageviewFieldNames);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);

  job.setOutputKeyClass(PageviewRecord.class);
  job.setOutputValueClass(NullWritable.class);

  int ret;
  try {
    ret = job.waitForCompletion(true) ? 0 : 1;
    boolean correct = verify();
    if (!correct) {
      throw new RuntimeException("Evaluation was not correct!");
    }
  } finally {
    shutdown();
  }
  return ret;
}
Example #9
Source File: Grep.java From big-c with Apache License 2.0
public int run(String[] args) throws Exception {
  if (args.length < 3) {
    System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
    ToolRunner.printGenericCommandUsage(System.out);
    return 2;
  }

  Path tempDir =
      new Path("grep-temp-" +
          Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  Configuration conf = getConf();
  conf.set(RegexMapper.PATTERN, args[2]);
  if (args.length == 4)
    conf.set(RegexMapper.GROUP, args[3]);

  Job grepJob = Job.getInstance(conf);

  try {
    grepJob.setJobName("grep-search");
    grepJob.setJarByClass(Grep.class);

    FileInputFormat.setInputPaths(grepJob, args[0]);

    grepJob.setMapperClass(RegexMapper.class);

    grepJob.setCombinerClass(LongSumReducer.class);
    grepJob.setReducerClass(LongSumReducer.class);

    FileOutputFormat.setOutputPath(grepJob, tempDir);
    grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    grepJob.setOutputKeyClass(Text.class);
    grepJob.setOutputValueClass(LongWritable.class);

    grepJob.waitForCompletion(true);

    Job sortJob = Job.getInstance(conf);
    sortJob.setJobName("grep-sort");
    sortJob.setJarByClass(Grep.class);

    FileInputFormat.setInputPaths(sortJob, tempDir);
    sortJob.setInputFormatClass(SequenceFileInputFormat.class);

    sortJob.setMapperClass(InverseMapper.class);

    sortJob.setNumReduceTasks(1);               // write a single file
    FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
    sortJob.setSortComparatorClass(             // sort by decreasing freq
        LongWritable.DecreasingComparator.class);

    sortJob.waitForCompletion(true);
  } finally {
    FileSystem.get(conf).delete(tempDir, true);
  }
  return 0;
}
Example #10
Source File: BasicJobChaining.java From hadoop-map-reduce-patterns with Apache License 2.0
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

  if (otherArgs.length != 3) {
    System.err.println("Usage: JobChainingDriver <posts> <users> <out>");
    System.exit(2);
  }

  Path postInput = new Path(otherArgs[0]);
  Path userInput = new Path(otherArgs[1]);
  Path outputDirIntermediate = new Path(otherArgs[2] + "_int");
  Path outputDir = new Path(otherArgs[2]);

  // Setup first job to counter user posts
  Job countingJob = new Job(conf, "JobChaining-Counting");
  countingJob.setJarByClass(BasicJobChaining.class);

  // Set our mapper and reducer, we can use the API's long sum reducer for
  // a combiner!
  countingJob.setMapperClass(UserIdCountMapper.class);
  countingJob.setCombinerClass(LongSumReducer.class);
  countingJob.setReducerClass(UserIdSumReducer.class);

  countingJob.setOutputKeyClass(Text.class);
  countingJob.setOutputValueClass(LongWritable.class);

  countingJob.setInputFormatClass(TextInputFormat.class);
  TextInputFormat.addInputPath(countingJob, postInput);

  countingJob.setOutputFormatClass(TextOutputFormat.class);
  TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

  // Execute job and grab exit code
  int code = countingJob.waitForCompletion(true) ? 0 : 1;

  if (code == 0) {
    // Calculate the average posts per user by getting counter values
    double numRecords = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdCountMapper.RECORDS_COUNTER_NAME)
        .getValue();
    double numUsers = (double) countingJob.getCounters()
        .findCounter(AVERAGE_CALC_GROUP, UserIdSumReducer.USERS_COUNTER_NAME)
        .getValue();

    double averagePostsPerUser = numRecords / numUsers;

    // Setup binning job
    Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
    binningJob.setJarByClass(BasicJobChaining.class);

    // Set mapper and the average posts per user
    binningJob.setMapperClass(UserIdBinningMapper.class);
    UserIdBinningMapper.setAveragePostsPerUser(binningJob, averagePostsPerUser);

    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, outputDirIntermediate);

    // Add two named outputs for below/above average
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_BELOW_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addNamedOutput(binningJob, MULTIPLE_OUTPUTS_ABOVE_NAME,
        TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(binningJob, true);

    TextOutputFormat.setOutputPath(binningJob, outputDir);

    // Add the user files to the DistributedCache
    FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
    for (FileStatus status : userFiles) {
      DistributedCache.addCacheFile(status.getPath().toUri(),
          binningJob.getConfiguration());
    }

    // Execute job and grab exit code
    code = binningJob.waitForCompletion(true) ? 0 : 1;
  }

  // Clean up the intermediate output
  FileSystem.get(conf).delete(outputDirIntermediate, true);

  System.exit(code);
}
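Unlike the Grep example, this counting job uses LongSumReducer only as the combiner while a custom UserIdSumReducer performs the final reduction. That works because a combiner only needs to consume and produce the mapper's output types (Text, LongWritable); the project-specific reducer can then add extra logic, presumably including maintaining the USERS_COUNTER_NAME counter that the driver reads afterwards to compute the average posts per user.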