Java Code Examples for org.apache.hadoop.mapred.JobConf#getLong()
The following examples show how to use org.apache.hadoop.mapred.JobConf#getLong().
You can go to the original project or source file by following the links above each example.
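As a quick orientation before the examples: JobConf inherits getLong(String name, long defaultValue) from Configuration. It returns the named property parsed as a long, or the supplied default when the property is not set. A minimal sketch, assuming the Hadoop client jars are on the classpath; the property names below are hypothetical and not taken from any of the projects that follow:

import org.apache.hadoop.mapred.JobConf;

public class GetLongSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();
    // Hypothetical key, set explicitly so the lookup below finds it.
    conf.setLong("example.roll.interval.sec", 300L);
    // Key present: returns 300, then converted from seconds to milliseconds.
    long rollMs = conf.getLong("example.roll.interval.sec", 60L) * 1000;
    // Key absent: the default (1024) is returned.
    long maxBytes = conf.getLong("example.max.bytes", 1024L);
    System.out.println(rollMs + " " + maxBytes);
  }
}

Many of the examples below follow the same pattern: read a tuning value with a sensible default, then scale it into internal units (milliseconds, bytes, and so on).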
Example 1
Source File: GenWriterThread.java From RDFS with Apache License 2.0 | 6 votes |
/**
 * Create a number of threads to generate write traffic
 * @param conf
 * @param key name of the mapper
 * @param value location of data input
 * @return
 * @throws IOException
 */
@Override
public GenThread[] prepare(JobConf conf, Text key, Text value)
    throws IOException {
  this.rtc = new GenWriterRunTimeConstants();
  super.prepare(conf, key, value, rtc);
  rtc.task_name = key.toString() + rtc.taskID;
  rtc.roll_interval = conf.getLong(WRITER_ROLL_INTERVAL_KEY,
      DEFAULT_ROLL_INTERVAL_SEC) * 1000;
  rtc.sync_interval = conf.getLong(WRITER_SYNC_INTERVAL_KEY,
      DEFAULT_SYNC_INTERVAL_SEC) * 1000;
  rtc.max_time = conf.getLong(MAX_TIME_SEC_KEY, DEFAULT_MAX_TIME_SEC) * 1000;
  rtc.data_rate = conf.getLong(WRITER_DATARATE_KEY, DEFAULT_DATA_RATE) * 1024;
  rtc.input = value.toString();
  LOG.info("data rate: " + rtc.data_rate);
  GenWriterThread[] threads = new GenWriterThread[(int) rtc.nthreads];
  for (int i = 0; i < rtc.nthreads; i++) {
    threads[i] = new GenWriterThread(conf,
        new Path(new Path(rtc.input, rtc.task_name), rtc.task_name + "_" + i),
        rtc.task_name, i, rtc);
  }
  return threads;
}
Example 2
Source File: RandomWriter.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Save the values out of the configuration that we need to write
 * the data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
                                1*1024*1024*1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange =
    job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange =
    job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
Example 3
Source File: DynamoDBSplitGenerator.java From emr-dynamodb-connector with Apache License 2.0 | 5 votes |
public InputSplit[] generateSplits(int maxClusterMapTasks, int numSegments,
    JobConf conf) {
  log.info("Generating " + numSegments + " segments for " + maxClusterMapTasks
      + " max mappers");

  int numMappers = Math.min(maxClusterMapTasks, numSegments);
  List<List<Integer>> segmentsPerSplit = new ArrayList<List<Integer>>(numMappers);
  for (int i = 0; i < numMappers; i++) {
    segmentsPerSplit.add(new ArrayList<Integer>());
  }

  // Round-robin which split gets which segment id
  int mapper = 0;
  for (int i = 0; i < numSegments; i++) {
    segmentsPerSplit.get(mapper).add(i);
    mapper = (mapper + 1) % numMappers;
  }

  long approxItemCountPerSplit = conf.getLong(DynamoDBConstants.ITEM_COUNT, 0)
      / ((long) numMappers);
  InputSplit[] splits = new InputSplit[numMappers];
  for (int i = 0; i < numMappers; i++) {
    log.info("Assigning " + segmentsPerSplit.get(i).size()
        + " segments to mapper " + i + ": " + segmentsPerSplit.get(i));
    splits[i] = createDynamoDBSplit(getInputPath(conf), approxItemCountPerSplit,
        i, segmentsPerSplit.get(i), numSegments);
  }

  return splits;
}
Example 4
Source File: ShuffleSchedulerImpl.java From hadoop with Apache License 2.0 | 5 votes |
public ShuffleSchedulerImpl(JobConf job, TaskStatus status,
                            TaskAttemptID reduceId,
                            ExceptionReporter reporter,
                            Progress progress,
                            Counters.Counter shuffledMapsCounter,
                            Counters.Counter reduceShuffleBytes,
                            Counters.Counter failedShuffleCounter) {
  totalMaps = job.getNumMapTasks();
  abortFailureLimit = Math.max(30, totalMaps / 10);
  copyTimeTracker = new CopyTimeTracker();
  remainingMaps = totalMaps;
  finishedMaps = new boolean[remainingMaps];
  this.reporter = reporter;
  this.status = status;
  this.reduceId = reduceId;
  this.progress = progress;
  this.shuffledMapsCounter = shuffledMapsCounter;
  this.reduceShuffleBytes = reduceShuffleBytes;
  this.failedShuffleCounter = failedShuffleCounter;
  this.startTime = Time.monotonicNow();
  lastProgressTime = startTime;
  referee.start();
  this.maxFailedUniqueFetches = Math.min(totalMaps, 5);
  this.maxFetchFailuresBeforeReporting = job.getInt(
      MRJobConfig.SHUFFLE_FETCH_FAILURES, REPORT_FAILURE_LIMIT);
  this.reportReadErrorImmediately = job.getBoolean(
      MRJobConfig.SHUFFLE_NOTIFY_READERROR, true);
  this.maxDelay = job.getLong(MRJobConfig.MAX_SHUFFLE_FETCH_RETRY_DELAY,
      MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY);
  this.maxHostFailures = job.getInt(
      MRJobConfig.MAX_SHUFFLE_FETCH_HOST_FAILURES,
      MRJobConfig.DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES);
}
Example 5
Source File: NodeDumper.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
/**
 * Configures the job, sets the flag for type of content and the topN number
 * if any.
 */
public void configure(JobConf conf) {
  this.conf = conf;
  this.inlinks = conf.getBoolean("inlinks", false);
  this.outlinks = conf.getBoolean("outlinks", false);
  this.scores = conf.getBoolean("scores", true);
  this.topn = conf.getLong("topn", Long.MAX_VALUE);
}
Example 6
Source File: HadoopArchives.java From hadoop with Apache License 2.0 | 5 votes |
public void configure(JobConf conf) {
  this.conf = conf;

  // this is tightly tied to map reduce
  // since it does not expose an api
  // to get the partition
  partId = conf.getInt(MRJobConfig.TASK_PARTITION, -1);
  // create a file name using the partition
  // we need to write to this directory
  tmpOutputDir = FileOutputFormat.getWorkOutputPath(conf);
  blockSize = conf.getLong(HAR_BLOCKSIZE_LABEL, blockSize);
  // get the output path and write to the tmp
  // directory
  partname = "part-" + partId;

  tmpOutput = new Path(tmpOutputDir, partname);
  rootPath = (conf.get(SRC_PARENT_LABEL, null) == null) ? null :
              new Path(conf.get(SRC_PARENT_LABEL));
  if (rootPath == null) {
    throw new RuntimeException("Unable to read parent " +
        "path for har from config");
  }
  try {
    destFs = tmpOutput.getFileSystem(conf);
    // this was a stale copy
    if (destFs.exists(tmpOutput)) {
      destFs.delete(tmpOutput, false);
    }
    partStream = destFs.create(tmpOutput, false,
        conf.getInt("io.file.buffer.size", 4096),
        destFs.getDefaultReplication(tmpOutput), blockSize);
  } catch (IOException ie) {
    throw new RuntimeException("Unable to open output file " + tmpOutput, ie);
  }
  buffer = new byte[buf_size];
}
Example 7
Source File: RandomTextWriter.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Save the configuration value that we need to write the data.
 */
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomtextwrite.bytes_per_map",
                                1*1024*1024*1024);
  minWordsInKey = job.getInt("test.randomtextwrite.min_words_key", 5);
  wordsInKeyRange =
    (job.getInt("test.randomtextwrite.max_words_key", 10) - minWordsInKey);
  minWordsInValue = job.getInt("test.randomtextwrite.min_words_value", 10);
  wordsInValueRange =
    (job.getInt("test.randomtextwrite.max_words_value", 100) - minWordsInValue);
}
Example 8
Source File: DistCpV1.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * Calculate how many maps to run.
 * Number of maps is bounded by a minimum of the cumulative size of the
 * copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the
 * command line) and at most (distcp.max.map.tasks, default
 * MAX_MAPS_PER_NODE * nodes in the cluster).
 * @param totalBytes Count of total bytes for job
 * @param job The job to configure
 * @return Count of maps to run.
 */
private static int setMapCount(long totalBytes, JobConf job)
    throws IOException {
  int numMaps =
    (int)(totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP));
  numMaps = Math.min(numMaps,
      job.getInt(MAX_MAPS_LABEL, MAX_MAPS_PER_NODE *
        new JobClient(job).getClusterStatus().getTaskTrackers()));
  numMaps = Math.max(numMaps, 1);
  job.setNumMapTasks(numMaps);
  return numMaps;
}
Example 9
Source File: AbstractDynamoDBInputFormat.java From emr-dynamodb-connector with Apache License 2.0 | 5 votes |
@Override
public InputSplit[] getSplits(JobConf conf, int desiredSplits) throws IOException {
  JobClient jobClient = new JobClient(conf);
  int maxClusterMapTasks = DynamoDBUtil.calcMaxMapTasks(jobClient);
  if (maxClusterMapTasks < 1) {
    throw new RuntimeException("Number of map tasks configured for the cluster less than 1. Map"
        + " tasks: " + maxClusterMapTasks);
  }

  double readPercentage = Double.parseDouble(conf.get(DynamoDBConstants
      .THROUGHPUT_READ_PERCENT, DynamoDBConstants.DEFAULT_THROUGHPUT_PERCENTAGE));
  if (readPercentage <= 0) {
    throw new RuntimeException("Invalid read percentage: " + readPercentage);
  }
  log.info("Read percentage: " + readPercentage);

  double maxReadThroughputAllocated =
      ((double) conf.getLong(DynamoDBConstants.READ_THROUGHPUT, 1));
  double maxWriteThroughputAllocated =
      ((double) conf.getLong(DynamoDBConstants.WRITE_THROUGHPUT, 1));

  if (maxReadThroughputAllocated < 1.0) {
    throw new RuntimeException("Read throughput should not be less than 1. Read throughput "
        + "percent: " + maxReadThroughputAllocated);
  }

  int configuredReadThroughput =
      (int) Math.floor(maxReadThroughputAllocated * readPercentage);
  if (configuredReadThroughput < 1) {
    configuredReadThroughput = 1;
  }

  long tableSizeBytes = conf.getLong(DynamoDBConstants.TABLE_SIZE_BYTES, 1);
  int numSegments = getNumSegments((int) maxReadThroughputAllocated,
      (int) maxWriteThroughputAllocated, tableSizeBytes, conf);
  int numMappers = getNumMappers(maxClusterMapTasks, configuredReadThroughput, conf);

  log.info("Using " + numSegments + " segments across " + numMappers + " mappers");

  return getSplitGenerator().generateSplits(numMappers, numSegments, conf);
}
Example 10
Source File: SleepJob.java From RDFS with Apache License 2.0 | 5 votes |
public void configure(JobConf job) {
  this.mapSleepCount =
    job.getInt("sleep.job.map.sleep.count", mapSleepCount);
  this.reduceSleepCount =
    job.getInt("sleep.job.reduce.sleep.count", reduceSleepCount);
  this.mapSleepDuration =
    job.getLong("sleep.job.map.sleep.time", 100) / mapSleepCount;
  this.reduceSleepDuration =
    job.getLong("sleep.job.reduce.sleep.time", 100) / reduceSleepCount;
  this.countersPerTask =
    job.getInt("sleep.job.counters.per.task", 0);
  makeSomeTasksSlower(job);
}
Example 11
Source File: RandomWriter.java From hadoop-book with Apache License 2.0 | 5 votes |
/**
 * Save the values out of the configuration that we need to write the
 * data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
      1 * 1024 * 1024 * 1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange =
      job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange =
      job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
Example 12
Source File: DataJoinReducerBase.java From big-c with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  super.configure(job);
  this.job = job;
  this.maxNumOfValuesPerGroup =
    job.getLong("datajoin.maxNumOfValuesPerGroup", 100);
}
Example 13
Source File: CrawlDbReader.java From anthelion with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
Example 14
Source File: PipeMapRed.java From RDFS with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  try {
    String argv = getPipeCommand(job);

    joinDelay_ = job.getLong("stream.joindelay.milli", 0);

    job_ = job;
    fs_ = FileSystem.get(job_);

    String mapOutputFieldSeparator =
        job_.get("stream.map.output.field.separator", "\t");
    String reduceOutputFieldSeparator =
        job_.get("stream.reduce.output.field.separator", "\t");
    this.mapOutputFieldSeparator = mapOutputFieldSeparator.charAt(0);
    this.reduceOutFieldSeparator = reduceOutputFieldSeparator.charAt(0);
    this.numOfMapOutputKeyFields =
        job_.getInt("stream.num.map.output.key.fields", 1);
    this.numOfReduceOutputKeyFields =
        job_.getInt("stream.num.reduce.output.key.fields", 1);

    maxErrorBytes = job.getLong("stream.error.maxbytes", 100000);

    doPipe_ = getDoPipe();
    if (!doPipe_) return;

    setStreamJobDetails(job);

    String[] argvSplit = splitArgs(argv);
    String prog = argvSplit[0];
    File currentDir = new File(".").getAbsoluteFile();
    File jobCacheDir = new File(currentDir.getParentFile().getParent(), "work");
    if (new File(prog).isAbsolute()) {
      // we don't own it. Hope it is executable
    } else {
      FileUtil.chmod(new File(jobCacheDir, prog).toString(), "a+x");
    }

    //
    // argvSplit[0]:
    // An absolute path should be a preexisting valid path on all TaskTrackers
    // A relative path is converted into an absolute pathname by looking
    // up the PATH env variable. If it still fails, look it up in the
    // tasktracker's local working directory
    //
    if (!new File(argvSplit[0]).isAbsolute()) {
      PathFinder finder = new PathFinder("PATH");
      finder.prependPathComponent(jobCacheDir.toString());
      File f = finder.getAbsolutePath(argvSplit[0]);
      if (f != null) {
        argvSplit[0] = f.getAbsolutePath();
      }
      f = null;
    }

    // Wrap the stream program in a wrapper that allows admins to control
    // streaming job environment
    String wrapper = job.get("stream.wrapper");
    if (wrapper != null) {
      String[] wrapComponents = splitArgs(wrapper);
      int totallength = wrapComponents.length + argvSplit.length;
      String[] finalArgv = new String[totallength];
      for (int i = 0; i < wrapComponents.length; i++) {
        finalArgv[i] = wrapComponents[i];
      }
      for (int i = 0; i < argvSplit.length; i++) {
        finalArgv[wrapComponents.length + i] = argvSplit[i];
      }
      argvSplit = finalArgv;
    }

    logprintln("PipeMapRed exec " + Arrays.asList(argvSplit));
    Environment childEnv = (Environment) StreamUtil.env().clone();
    addJobConfToEnvironment(job_, childEnv);
    addEnvironment(childEnv, job_.get("stream.addenvironment"));
    sim = Runtime.getRuntime().exec(argvSplit, childEnv.toArray());

    /* // This way required jdk1.5
    Builder processBuilder = new ProcessBuilder(argvSplit);
    Map<String, String> env = processBuilder.environment();
    addEnvironment(env, job_.get("stream.addenvironment"));
    sim = processBuilder.start();
    */

    clientOut_ = new DataOutputStream(new BufferedOutputStream(sim.getOutputStream()));
    clientIn_ = new DataInputStream(new BufferedInputStream(sim.getInputStream()));
    clientErr_ = new DataInputStream(new BufferedInputStream(sim.getErrorStream()));
    startTime_ = System.currentTimeMillis();

  } catch (Exception e) {
    logStackTrace(e);
    LOG.error("configuration exception", e);
    throw new RuntimeException("configuration exception", e);
  }
}
Example 15
Source File: RandomTextWriter.java From RDFS with Apache License 2.0 | 4 votes |
/**
 * This is the main routine for launching a distributed random write job.
 * It runs 10 maps/node and each node writes 1 gig of data to a DFS file.
 * The reduce doesn't do anything.
 *
 * @throws IOException
 */
public int run(String[] args) throws Exception {
  if (args.length == 0) {
    return printUsage();
  }

  JobConf job = new JobConf(getConf());

  job.setJarByClass(RandomTextWriter.class);
  job.setJobName("random-text-writer");

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);

  job.setInputFormat(RandomWriter.RandomInputFormat.class);
  job.setMapperClass(Map.class);

  JobClient client = new JobClient(job);
  ClusterStatus cluster = client.getClusterStatus();
  int numMapsPerHost = job.getInt("test.randomtextwrite.maps_per_host", 10);
  long numBytesToWritePerMap =
    job.getLong("test.randomtextwrite.bytes_per_map", 1*1024*1024*1024);
  if (numBytesToWritePerMap == 0) {
    System.err.println("Cannot have test.randomtextwrite.bytes_per_map set to 0");
    return -2;
  }
  long totalBytesToWrite = job.getLong("test.randomtextwrite.total_bytes",
      numMapsPerHost*numBytesToWritePerMap*cluster.getTaskTrackers());
  int numMaps = (int) (totalBytesToWrite / numBytesToWritePerMap);
  if (numMaps == 0 && totalBytesToWrite > 0) {
    numMaps = 1;
    job.setLong("test.randomtextwrite.bytes_per_map", totalBytesToWrite);
  }

  Class<? extends OutputFormat> outputFormatClass =
    SequenceFileOutputFormat.class;
  List<String> otherArgs = new ArrayList<String>();
  for (int i = 0; i < args.length; ++i) {
    try {
      if ("-outFormat".equals(args[i])) {
        outputFormatClass =
          Class.forName(args[++i]).asSubclass(OutputFormat.class);
      } else {
        otherArgs.add(args[i]);
      }
    } catch (ArrayIndexOutOfBoundsException except) {
      System.out.println("ERROR: Required parameter missing from " + args[i-1]);
      return printUsage(); // exits
    }
  }

  job.setOutputFormat(outputFormatClass);
  FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(0)));

  job.setNumMapTasks(numMaps);
  System.out.println("Running " + numMaps + " maps.");

  // reducer NONE
  job.setNumReduceTasks(0);

  Date startTime = new Date();
  System.out.println("Job started: " + startTime);
  JobClient.runJob(job);
  Date endTime = new Date();
  System.out.println("Job ended: " + endTime);
  System.out.println("The job took " +
      (endTime.getTime() - startTime.getTime()) / 1000 + " seconds.");

  return 0;
}
Example 16
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  long lmin = job.getLong("db.reader.topn.min", 0);
  if (lmin != 0) {
    min = (float) lmin / 1000000.0f;
  }
}
Example 17
Source File: DistCp.java From hadoop-gpu with Apache License 2.0 | 4 votes |
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
        ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePair value = new FilePair();
  final long targetsize = cbsize / numSplits;
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // if adding this split would put this split past the target size,
      // cut the last split and put this next file in the next split.
      if (acc + key.get() > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += key.get();
    }
  } finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
  }

  return splits.toArray(new FileSplit[splits.size()]);
}
Example 18
Source File: TestDFSIO.java From big-c with Apache License 2.0 | 4 votes |
@Override // Mapper
public void configure(JobConf conf) {
  super.configure(conf);
  skipSize = conf.getLong("test.io.skip.size", 0);
}
Example 19
Source File: PipeMapRed.java From hadoop-gpu with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  try {
    String argv = getPipeCommand(job);

    joinDelay_ = job.getLong("stream.joindelay.milli", 0);

    job_ = job;
    fs_ = FileSystem.get(job_);

    nonZeroExitIsFailure_ =
        job_.getBoolean("stream.non.zero.exit.is.failure", true);

    doPipe_ = getDoPipe();
    if (!doPipe_) return;

    setStreamJobDetails(job);

    String[] argvSplit = splitArgs(argv);
    String prog = argvSplit[0];
    File currentDir = new File(".").getAbsoluteFile();
    if (new File(prog).isAbsolute()) {
      // we don't own it. Hope it is executable
    } else {
      FileUtil.chmod(new File(currentDir, prog).toString(), "a+x");
    }

    //
    // argvSplit[0]:
    // An absolute path should be a preexisting valid path on all TaskTrackers
    // A relative path is converted into an absolute pathname by looking
    // up the PATH env variable. If it still fails, look it up in the
    // tasktracker's local working directory
    //
    if (!new File(argvSplit[0]).isAbsolute()) {
      PathFinder finder = new PathFinder("PATH");
      finder.prependPathComponent(currentDir.toString());
      File f = finder.getAbsolutePath(argvSplit[0]);
      if (f != null) {
        argvSplit[0] = f.getAbsolutePath();
      }
      f = null;
    }
    logprintln("PipeMapRed exec " + Arrays.asList(argvSplit));
    Environment childEnv = (Environment) StreamUtil.env().clone();
    addJobConfToEnvironment(job_, childEnv);
    addEnvironment(childEnv, job_.get("stream.addenvironment"));
    // add TMPDIR environment variable with the value of java.io.tmpdir
    envPut(childEnv, "TMPDIR", System.getProperty("java.io.tmpdir"));

    // Start the process
    ProcessBuilder builder = new ProcessBuilder(argvSplit);
    builder.environment().putAll(childEnv.toMap());
    sim = builder.start();

    clientOut_ = new DataOutputStream(new BufferedOutputStream(
        sim.getOutputStream(), BUFFER_SIZE));
    clientIn_ = new DataInputStream(new BufferedInputStream(
        sim.getInputStream(), BUFFER_SIZE));
    clientErr_ = new DataInputStream(new BufferedInputStream(sim.getErrorStream()));
    startTime_ = System.currentTimeMillis();

    errThread_ = new MRErrorThread();
    errThread_.start();
  } catch (Exception e) {
    logStackTrace(e);
    LOG.error("configuration exception", e);
    throw new RuntimeException("configuration exception", e);
  }
}
Example 20
Source File: TeraGen.java From hadoop-gpu with Apache License 2.0 | 4 votes |
static long getNumberOfRows(JobConf job) {
  return job.getLong("terasort.num-rows", 0);
}