Java Code Examples for org.apache.hadoop.mapred.JobConf#setNumMapTasks()
The following examples show how to use org.apache.hadoop.mapred.JobConf#setNumMapTasks(). Each example is drawn from an open-source project; the source file, originating project, and license are noted above it.
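For the old mapred API, setNumMapTasks(int) is only a hint to the framework: the actual number of map tasks is determined by the number of InputSplits the job's InputFormat produces. A minimal usage sketch (MyDriver is a hypothetical placeholder, not taken from the examples below):

import org.apache.hadoop.mapred.JobConf;

// Hint at the desired parallelism; the InputFormat may still
// produce more or fewer splits than this.
JobConf conf = new JobConf(MyDriver.class);  // MyDriver: hypothetical driver class
conf.setNumMapTasks(10);
conf.setNumReduceTasks(2);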
Example 1
Source File: DistCp.java From RDFS with Apache License 2.0
/**
 * Calculate how many maps to run.
 * Number of maps is bounded by a minimum of the cumulative size of the
 * copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the
 * command line) and at most (distcp.max.map.tasks, default
 * MAX_MAPS_PER_NODE * nodes in the cluster).
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 * @param client JobClient object to access the cluster
 * @return Count of maps to run.
 */
private static int setMapCount(long totalBytes, JobConf job, JobClient client)
    throws IOException {
  int numMaps =
      (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  int numTasks = MAX_MAPS_DEFAULT;
  try {
    numTasks = client.getClusterStatus().getTaskTrackers();
  } catch (UnsupportedOperationException uex) {
    // This is a Corona client, which does not support getClusterStatus().
  }
  numMaps = Math.min(numMaps,
      job.getInt(MAX_MAPS_LABEL, MAX_MAPS_PER_NODE * numTasks));
  job.setNumMapTasks(Math.max(numMaps, 1));
  return Math.max(numMaps, 1);
}
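To illustrate the bound with hypothetical numbers: copying 10 GB with 256 MB per map yields 40 maps, and on a cluster of 2 task trackers with a per-node cap of 20 the upper bound is also 40, so 40 map tasks are configured. The 256 MB and 20 figures are assumptions about the BYTES_PER_MAP and MAX_MAPS_PER_NODE constants, which are not shown in this snippet.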
Example 2
Source File: JobControlTestUtils.java From hadoop-gpu with Apache License 2.0
/**
 * Creates a simple copy job.
 *
 * @param indirs List of input directories.
 * @param outdir Output directory.
 * @return JobConf initialised for a simple copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception {
  Configuration defaults = new Configuration();
  JobConf theJob = new JobConf(defaults, TestJobControl.class);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0]));
  theJob.setMapperClass(DataCopy.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopy.class);
  theJob.setNumMapTasks(12);
  theJob.setNumReduceTasks(4);
  return theJob;
}
Example 3
Source File: JobControlTestUtils.java From big-c with Apache License 2.0
/**
 * Creates a simple copy job.
 *
 * @param indirs List of input directories.
 * @param outdir Output directory.
 * @return JobConf initialised for a simple copy job.
 * @throws Exception If an error occurs creating job configuration.
 */
static JobConf createCopyJob(List<Path> indirs, Path outdir) throws Exception {
  Configuration defaults = new Configuration();
  JobConf theJob = new JobConf(defaults, TestJobControl.class);
  theJob.setJobName("DataMoveJob");

  FileInputFormat.setInputPaths(theJob, indirs.toArray(new Path[0]));
  theJob.setMapperClass(DataCopy.class);
  FileOutputFormat.setOutputPath(theJob, outdir);
  theJob.setOutputKeyClass(Text.class);
  theJob.setOutputValueClass(Text.class);
  theJob.setReducerClass(DataCopy.class);
  theJob.setNumMapTasks(12);
  theJob.setNumReduceTasks(4);
  return theJob;
}
Example 4
Source File: SliveTest.java From big-c with Apache License 2.0
/**
 * Sets up a job conf for the given job using the given config object. Ensures
 * that the correct input format is set, along with the mapper and reducer
 * classes, the input and output key and value classes, and any other job
 * configuration.
 *
 * @param config
 * @return JobConf representing the job to be run
 * @throws IOException
 */
private JobConf getJob(ConfigExtractor config) throws IOException {
  JobConf job = new JobConf(config.getConfig(), SliveTest.class);
  job.setInputFormat(DummyInputFormat.class);
  FileOutputFormat.setOutputPath(job, config.getOutputPath());
  job.setMapperClass(SliveMapper.class);
  job.setPartitionerClass(SlivePartitioner.class);
  job.setReducerClass(SliveReducer.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setOutputFormat(TextOutputFormat.class);
  TextOutputFormat.setCompressOutput(job, false);
  job.setNumReduceTasks(config.getReducerAmount());
  job.setNumMapTasks(config.getMapAmount());
  return job;
}
Example 5
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0
/**
 * Ensures that the given number of map tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table The table to get the region count for.
 * @param job The current job configuration to adjust.
 * @throws IOException When retrieving the table details fails.
 */
// Used by tests.
public static void limitNumMapTasks(String table, JobConf job)
    throws IOException {
  int regions =
      MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job),
          TableName.valueOf(table));
  if (job.getNumMapTasks() > regions)
    job.setNumMapTasks(regions);
}
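A typical call pattern (the table name, driver class, and initial count below are hypothetical placeholders, not part of the HBase API) is to request a map count first and then cap it at the table's region count:

// Hedged usage sketch; "mytable", MyDriver, and 64 are placeholders.
JobConf job = new JobConf(HBaseConfiguration.create(), MyDriver.class);
job.setNumMapTasks(64);                               // desired parallelism
TableMapReduceUtil.limitNumMapTasks("mytable", job);  // never exceed region count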
Example 6
Source File: UtilsForTests.java From RDFS with Apache License 2.0
static RunningJob runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
                         int numReds) throws IOException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n"
               + "has many silly\n"
               + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return job;
}
Example 7
Source File: DistCpV1.java From big-c with Apache License 2.0
/**
 * Calculate how many maps to run.
 * Number of maps is bounded by a minimum of the cumulative size of the
 * copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the
 * command line) and at most (distcp.max.map.tasks, default
 * MAX_MAPS_PER_NODE * nodes in the cluster).
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 * @return Count of maps to run.
 */
private static int setMapCount(long totalBytes, JobConf job)
    throws IOException {
  int numMaps =
      (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  numMaps = Math.min(numMaps,
      job.getInt(MAX_MAPS_LABEL,
          MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus()
                                                .getTaskTrackers()));
  numMaps = Math.max(numMaps, 1);
  job.setNumMapTasks(numMaps);
  return numMaps;
}
Example 8
Source File: CloudBurst.java From emr-sample-apps with Apache License 2.0
public static void filter(String alignpath,
                          String outpath,
                          int nummappers,
                          int numreducers) throws IOException, Exception {
  System.out.println("NUM_FMAP_TASKS: " + nummappers);
  System.out.println("NUM_FREDUCE_TASKS: " + numreducers);

  JobConf conf = new JobConf(FilterAlignments.class);
  conf.setJobName("FilterAlignments");
  conf.setNumMapTasks(nummappers);
  conf.setNumReduceTasks(numreducers);

  FileInputFormat.addInputPath(conf, new Path(alignpath));

  conf.setMapperClass(FilterMapClass.class);
  conf.setInputFormat(SequenceFileInputFormat.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(BytesWritable.class);

  conf.setCombinerClass(FilterCombinerClass.class);

  conf.setReducerClass(FilterReduceClass.class);
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(BytesWritable.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);

  Path oPath = new Path(outpath);
  FileOutputFormat.setOutputPath(conf, oPath);
  System.err.println("  Removing old results");
  FileSystem.get(conf).delete(oPath);

  JobClient.runJob(conf);
  System.err.println("FilterAlignments Finished");
}
Example 9
Source File: TestMROldApiJobs.java From big-c with Apache License 2.0
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
                      int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n"
               + "has many silly\n"
               + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
Example 10
Source File: CopyFromS3.java From emr-sample-apps with Apache License 2.0
/**
 * This method constructs the JobConf to be used to run the map reduce job to
 * download the files from S3. This is a potentially expensive method since it
 * makes multiple calls to S3 to get a listing of all the input data. Clients
 * are encouraged to cache the returned JobConf reference and not call this
 * method multiple times unless necessary.
 *
 * @return the JobConf to be used to run the map reduce job to download the
 *         files from S3.
 */
public JobConf getJobConf() throws IOException, ParseException {
  JobConf conf = new JobConf(CopyFromS3.class);
  conf.setJobName("CopyFromS3");
  conf.setOutputKeyClass(NullWritable.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapperClass(S3CopyMapper.class);
  // We configure a reducer, even though we don't use it right now.
  // The idea is that, in the future we may.
  conf.setReducerClass(HDFSWriterReducer.class);
  conf.setNumReduceTasks(0);

  FileInputFormat.setInputPaths(conf, new Path(tempFile));
  FileOutputFormat.setOutputPath(conf, new Path(outputPath));
  conf.setOutputFormat(TextOutputFormat.class);
  conf.setCompressMapOutput(true);

  JobClient jobClient = new JobClient(conf);

  FileSystem inputFS = FileSystem.get(URI.create(inputPathPrefix), conf);
  DatePathFilter datePathFilter = new DatePathFilter(startDate, endDate);
  List<Path> filePaths = getFilePaths(inputFS, new Path(inputPathPrefix),
      datePathFilter, jobClient.getDefaultMaps());

  // Write the file names to a temporary index file to be used
  // as input to the map tasks.
  FileSystem outputFS = FileSystem.get(URI.create(tempFile), conf);
  FSDataOutputStream outputStream = outputFS.create(new Path(tempFile), true);
  try {
    for (Path path : filePaths) {
      outputStream.writeBytes(path.toString() + "\n");
    }
  } finally {
    outputStream.close();
  }

  conf.setNumMapTasks(Math.min(filePaths.size(), jobClient.getDefaultMaps()));

  return conf;
}
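Note the final setNumMapTasks call: capping the map count at filePaths.size() presumably avoids scheduling empty map tasks when there are fewer input files than the cluster's default number of maps.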
Example 11
Source File: DistCpV1.java From hadoop with Apache License 2.0
/**
 * Calculate how many maps to run.
 * Number of maps is bounded by a minimum of the cumulative size of the
 * copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the
 * command line) and at most (distcp.max.map.tasks, default
 * MAX_MAPS_PER_NODE * nodes in the cluster).
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 * @return Count of maps to run.
 */
private static int setMapCount(long totalBytes, JobConf job)
    throws IOException {
  int numMaps =
      (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  numMaps = Math.min(numMaps,
      job.getInt(MAX_MAPS_LABEL,
          MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus()
                                                .getTaskTrackers()));
  numMaps = Math.max(numMaps, 1);
  job.setNumMapTasks(numMaps);
  return numMaps;
}
Example 12
Source File: TestMROldApiJobs.java From hadoop with Apache License 2.0
static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
                      int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n"
               + "has many silly\n"
               + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
Example 13
Source File: LoadGeneratorMR.java From big-c with Apache License 2.0
/**
 * Based on args we submit the LoadGenerator as MR job.
 * Number of MapTasks is numMapTasks
 * @return exitCode for job submission
 */
private int submitAsMapReduce() {

  System.out.println("Running as a MapReduce job with " + numMapTasks
      + " mapTasks; Output to file " + mrOutDir);

  Configuration conf = new Configuration(getConf());

  // First set all the args of LoadGenerator as Conf vars to pass to MR tasks
  conf.set(LG_ROOT, root.toString());
  conf.setInt(LG_MAXDELAYBETWEENOPS, maxDelayBetweenOps);
  conf.setInt(LG_NUMOFTHREADS, numOfThreads);
  conf.set(LG_READPR, readProbs[0] + "");   // Pass Double as string
  conf.set(LG_WRITEPR, writeProbs[0] + ""); // Pass Double as string
  conf.setLong(LG_SEED, seed); // No idea what this is
  conf.setInt(LG_NUMMAPTASKS, numMapTasks);
  if (scriptFile == null && durations[0] <= 0) {
    System.err.println("When run as a MapReduce job, elapsed Time or ScriptFile must be specified");
    System.exit(-1);
  }
  conf.setLong(LG_ELAPSEDTIME, durations[0]);
  conf.setLong(LG_STARTTIME, startTime);
  if (scriptFile != null) {
    conf.set(LG_SCRIPTFILE, scriptFile);
  }
  conf.set(LG_FLAGFILE, flagFile.toString());

  // Now set the necessary conf variables that apply to run MR itself.
  JobConf jobConf = new JobConf(conf, LoadGenerator.class);
  jobConf.setJobName("NNLoadGeneratorViaMR");
  jobConf.setNumMapTasks(numMapTasks);
  jobConf.setNumReduceTasks(1); // 1 reducer to collect the results

  jobConf.setOutputKeyClass(Text.class);
  jobConf.setOutputValueClass(IntWritable.class);

  jobConf.setMapperClass(MapperThatRunsNNLoadGenerator.class);
  jobConf.setReducerClass(ReducerThatCollectsLGdata.class);

  jobConf.setInputFormat(DummyInputFormat.class);
  jobConf.setOutputFormat(TextOutputFormat.class);

  // Explicitly set number of max map attempts to 1.
  jobConf.setMaxMapAttempts(1);
  // Explicitly turn off speculative execution
  jobConf.setSpeculativeExecution(false);

  // This mapReduce job has no input but has output
  FileOutputFormat.setOutputPath(jobConf, new Path(mrOutDir));

  try {
    JobClient.runJob(jobConf);
  } catch (IOException e) {
    System.err.println("Failed to run job: " + e.getMessage());
    return -1;
  }
  return 0;
}
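Because this job uses a DummyInputFormat and has no real input, the configured map count can be honored exactly: the old-API JobClient passes job.getNumMapTasks() to InputFormat#getSplits as the numSplits hint, so a custom input format can emit exactly that many empty splits. The sketch below shows how such an input format might look; it is an assumption about the pattern, not the project's actual DummyInputFormat.

import java.io.DataInput;
import java.io.DataOutput;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class EmptySplitInputFormat
    implements InputFormat<NullWritable, NullWritable> {

  // One trivial split per requested map task, so setNumMapTasks(n)
  // results in exactly n mappers.
  public InputSplit[] getSplits(JobConf job, int numSplits) {
    InputSplit[] splits = new InputSplit[numSplits];
    for (int i = 0; i < numSplits; i++) {
      splits[i] = new EmptySplit();
    }
    return splits;
  }

  // A record reader that immediately reports end-of-input, so each
  // mapper runs once with no records.
  public RecordReader<NullWritable, NullWritable> getRecordReader(
      InputSplit split, JobConf job, Reporter reporter) {
    return new RecordReader<NullWritable, NullWritable>() {
      public boolean next(NullWritable key, NullWritable value) { return false; }
      public NullWritable createKey() { return NullWritable.get(); }
      public NullWritable createValue() { return NullWritable.get(); }
      public long getPos() { return 0; }
      public float getProgress() { return 0.0f; }
      public void close() { }
    };
  }

  // An empty, location-less split.
  static class EmptySplit implements InputSplit {
    public long getLength() { return 0; }
    public String[] getLocations() { return new String[0]; }
    public void write(DataOutput out) { }
    public void readFields(DataInput in) { }
  }
}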
Example 14
Source File: MRSharedCaching.java From RDFS with Apache License 2.0
public static FileSystem setupJob(String indir, String outdir, String cacheDir,
                                  JobConf conf, String input,
                                  boolean withSymlink) throws IOException {
  final Path inDir = new Path(indir);
  final Path outDir = new Path(outdir);
  FileSystem fs = FileSystem.get(conf);
  fs.delete(outDir, true);
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Mkdirs failed to create " + inDir.toString());
  }
  {
    DataOutputStream file = fs.create(new Path(inDir, "part-0"));
    file.writeBytes(input);
    file.close();
  }
  conf.setJobName("sharedcachetest");

  // the keys are words (strings)
  conf.setOutputKeyClass(Text.class);
  // the values are counts (ints)
  conf.setOutputValueClass(IntWritable.class);

  conf.setCombinerClass(MRSharedCaching.ReduceClass.class);
  conf.setReducerClass(MRSharedCaching.ReduceClass.class);
  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(1);
  conf.setNumReduceTasks(1);
  conf.setSpeculativeExecution(false);

  if (!withSymlink) {
    conf.setMapperClass(MRSharedCaching.MapClass.class);
  } else {
    conf.setMapperClass(MRSharedCaching.MapClass2.class);
  }

  // Turn on sharing
  conf.set("mapred.cache.shared.enabled", "true");

  return fs;
}
Example 15
Source File: RandomTextWriter.java From RDFS with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    return printUsage();
  }

  JobConf job = new JobConf(getConf());

  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomtextwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes",
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }

  Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass =
            Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took "
      + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return 0;
}
Example 16
Source File: PiEstimator.java From hadoop-gpu with Apache License 2.0
/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf)
    throws IOException {
  // setup job conf
  jobConf.setJobName(PiEstimator.class.getSimpleName());

  jobConf.setInputFormat(SequenceFileInputFormat.class);

  jobConf.setOutputKeyClass(BooleanWritable.class);
  jobConf.setOutputValueClass(LongWritable.class);
  jobConf.setOutputFormat(SequenceFileOutputFormat.class);

  jobConf.setMapperClass(PiMapper.class);
  jobConf.setNumMapTasks(numMaps);

  jobConf.setReducerClass(PiReducer.class);
  jobConf.setNumReduceTasks(1);

  // turn off speculative execution, because DFS doesn't handle
  // multiple writers to the same file.
  jobConf.setSpeculativeExecution(false);

  // setup input/output directories
  final Path inDir = new Path(TMP_DIR, "in");
  final Path outDir = new Path(TMP_DIR, "out");
  FileInputFormat.setInputPaths(jobConf, inDir);
  FileOutputFormat.setOutputPath(jobConf, outDir);

  final FileSystem fs = FileSystem.get(jobConf);
  if (fs.exists(TMP_DIR)) {
    throw new IOException("Tmp directory " + fs.makeQualified(TMP_DIR)
        + " already exists. Please remove it first.");
  }
  if (!fs.mkdirs(inDir)) {
    throw new IOException("Cannot create input directory " + inDir);
  }

  try {
    // generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
      final Path file = new Path(inDir, "part" + i);
      final LongWritable offset = new LongWritable(i * numPoints);
      final LongWritable size = new LongWritable(numPoints);
      final SequenceFile.Writer writer = SequenceFile.createWriter(
          fs, jobConf, file,
          LongWritable.class, LongWritable.class, CompressionType.NONE);
      try {
        writer.append(offset, size);
      } finally {
        writer.close();
      }
      System.out.println("Wrote input for Map #" + i);
    }

    // start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    JobClient.runJob(jobConf);
    final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
    System.out.println("Job Finished in " + duration + " seconds");

    // read outputs
    Path inFile = new Path(outDir, "reduce-out");
    LongWritable numInside = new LongWritable();
    LongWritable numOutside = new LongWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
    try {
      reader.next(numInside, numOutside);
    } finally {
      reader.close();
    }

    // compute estimated value
    return BigDecimal.valueOf(4).setScale(20)
        .multiply(BigDecimal.valueOf(numInside.get()))
        .divide(BigDecimal.valueOf(numMaps))
        .divide(BigDecimal.valueOf(numPoints));
  } finally {
    fs.delete(TMP_DIR, true);
  }
}
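The final BigDecimal expression is the standard Monte Carlo estimate: since numMaps * numPoints points are sampled in total, Pi is approximated as 4 * numInside / (numMaps * numPoints).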
Example 17
Source File: TestPipes.java From hadoop-gpu with Apache License 2.0
private void runProgram(MiniMRCluster mr, MiniDFSCluster dfs,
                        Path program, Path inputPath, Path outputPath,
                        int numMaps, int numReduces,
                        String[] expectedResults) throws IOException {
  Path wordExec = new Path("/testing/bin/application");
  JobConf job = mr.createJobConf();
  job.setNumMapTasks(numMaps);
  job.setNumReduceTasks(numReduces);
  {
    FileSystem fs = dfs.getFileSystem();
    fs.delete(wordExec.getParent(), true);
    fs.copyFromLocalFile(program, wordExec);
    Submitter.setExecutable(job, fs.makeQualified(wordExec).toString());
    Submitter.setIsJavaRecordReader(job, true);
    Submitter.setIsJavaRecordWriter(job, true);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    RunningJob rJob = null;
    if (numReduces == 0) {
      rJob = Submitter.jobSubmit(job);
      while (!rJob.isComplete()) {
        try {
          Thread.sleep(1000);
        } catch (InterruptedException ie) {
          throw new RuntimeException(ie);
        }
      }
    } else {
      rJob = Submitter.runJob(job);
    }
    assertTrue("pipes job failed", rJob.isSuccessful());

    Counters counters = rJob.getCounters();
    Counters.Group wordCountCounters = counters.getGroup("WORDCOUNT");
    int numCounters = 0;
    for (Counter c : wordCountCounters) {
      System.out.println(c);
      ++numCounters;
    }
    assertTrue("No counters found!", (numCounters > 0));
  }

  List<String> results = new ArrayList<String>();
  for (Path p : FileUtil.stat2Paths(dfs.getFileSystem().listStatus(outputPath,
      new OutputLogFilter()))) {
    results.add(TestMiniMRWithDFS.readOutput(p, job));
  }
  assertEquals("number of reduces is wrong",
      expectedResults.length, results.size());
  for (int i = 0; i < results.size(); i++) {
    assertEquals("pipes program " + program + " output " + i + " wrong",
        expectedResults[i], results.get(i));
  }
}
Example 18
Source File: DataJoinJob.java From big-c with Apache License 2.0
public static JobConf createDataJoinJob(String args[]) throws IOException {

  String inputDir = args[0];
  String outputDir = args[1];
  Class inputFormat = SequenceFileInputFormat.class;
  if (args[2].compareToIgnoreCase("text") != 0) {
    System.out.println("Using SequenceFileInputFormat: " + args[2]);
  } else {
    System.out.println("Using TextInputFormat: " + args[2]);
    inputFormat = TextInputFormat.class;
  }
  int numOfReducers = Integer.parseInt(args[3]);
  Class mapper = getClassByName(args[4]);
  Class reducer = getClassByName(args[5]);
  Class mapoutputValueClass = getClassByName(args[6]);
  Class outputFormat = TextOutputFormat.class;
  Class outputValueClass = Text.class;
  if (args[7].compareToIgnoreCase("text") != 0) {
    System.out.println("Using SequenceFileOutputFormat: " + args[7]);
    outputFormat = SequenceFileOutputFormat.class;
    outputValueClass = getClassByName(args[7]);
  } else {
    System.out.println("Using TextOutputFormat: " + args[7]);
  }
  long maxNumOfValuesPerGroup = 100;
  String jobName = "";
  if (args.length > 8) {
    maxNumOfValuesPerGroup = Long.parseLong(args[8]);
  }
  if (args.length > 9) {
    jobName = args[9];
  }

  Configuration defaults = new Configuration();
  JobConf job = new JobConf(defaults, DataJoinJob.class);
  job.setJobName("DataJoinJob: " + jobName);

  FileSystem fs = FileSystem.get(defaults);
  fs.delete(new Path(outputDir), true);
  FileInputFormat.setInputPaths(job, inputDir);

  job.setInputFormat(inputFormat);

  job.setMapperClass(mapper);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));
  job.setOutputFormat(outputFormat);
  SequenceFileOutputFormat.setOutputCompressionType(job,
      SequenceFile.CompressionType.BLOCK);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(mapoutputValueClass);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(outputValueClass);
  job.setReducerClass(reducer);

  job.setNumMapTasks(1);
  job.setNumReduceTasks(numOfReducers);
  job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup);
  return job;
}
Example 19
Source File: RandomWriter.java From hadoop-gpu with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());

  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);

  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);

  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes",
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }

  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took "
      + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return 0;
}
Example 20
Source File: RandomWriter.java From RDFS with Apache License 2.0
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    System.out.println("Usage: writer <out-dir>");
    ToolRunner.printGenericCommandUsage(System.out);
    return -1;
  }

  Path outDir = new Path(args[0]);
  JobConf job = new JobConf(getConf());

  job.setJarByClass(RandomWriter.class);
  job.setJobName("random-writer");
  FileOutputFormat.setOutputPath(job, outDir);

  job.setOutputKeyClass(BytesWritable.class);
  job.setOutputValueClass(BytesWritable.class);

  job.setInputFormat(RandomInputFormat.class);
  job.setMapperClass(Map.class);
  job.setReducerClass(IdentityReducer.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomwriter.maps_per_host", 10);
  long numBytesToWritePerMap =
      job.getLong("test.randomwrite.bytes_per_map", 1 * 1024 * 1024 * 1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomwrite.total_bytes",
      numMapsPerHost * numBytesToWritePerMap * cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomwrite.bytes_per_map", totalBytesToWrite);
  }

  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took "
      + (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return 0;
}