Java Code Examples for org.apache.hadoop.mapred.JobConf#getNumReduceTasks()
The following examples show how to use
org.apache.hadoop.mapred.JobConf#getNumReduceTasks(). Each example is taken from an open source project; the source file, project, and license are noted above each snippet.
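JobConf#getNumReduceTasks() returns the number of reduce tasks configured for the job (the mapred.reduce.tasks / mapreduce.job.reduces setting; the default is 1). A recurring pattern in the examples below is to read this value when partitioning map output or when validating a job's output specification. The minimal sketch below illustrates the partitioning case; ModuloPartitioner is a hypothetical class written for this page, not part of Hadoop.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Hypothetical partitioner: reads the configured reducer count once in
// configure() and later routes each key to one of that many partitions.
public class ModuloPartitioner implements Partitioner<IntWritable, Text> {

  private int reduceTasks = 1;

  public void configure(JobConf job) {
    // Number of reducers the job was configured with (defaults to 1).
    reduceTasks = job.getNumReduceTasks();
  }

  public int getPartition(IntWritable key, Text value, int numPartitions) {
    // The framework passes numPartitions == getNumReduceTasks(); the field
    // cached in configure() is used here only to show the call in context.
    return (key.get() & Integer.MAX_VALUE) % reduceTasks;
  }
}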
Example 1
Source File: ZephyrOutputFormat.java From zephyr with Apache License 2.0 | 6 votes |
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job)
    throws FileAlreadyExistsException, InvalidJobConfException, IOException {
  // Ensure that the output directory is set and not already there
  Path outDir = getOutputPath(job);
  if (outDir == null && job.getNumReduceTasks() != 0) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (outDir != null) {
    FileSystem fs = outDir.getFileSystem(job);
    // normalize the output directory
    outDir = fs.makeQualified(outDir);
    setOutputPath(job, outDir);

    // get delegation token for the outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[]{outDir}, job);

    String jobUuid = job.get("zephyr.job.uuid");
    if (jobUuid == null)
      throw new InvalidJobConfException(
          "This output format REQUIRES the value zephyr.job.uuid to be specified in the job configuration!");

    // // check its existence
    // if (fs.exists(outDir)) {
    //   throw new FileAlreadyExistsException("Output directory " + outDir
    //       + " already exists");
    // }
  }
}
Example 2
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0 | 6 votes |
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job configuration to adjust.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *   job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReduceJob(String table,
    Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
    boolean addDependencyJars) throws IOException {
  job.setOutputFormat(TableOutputFormat.class);
  job.setReducerClass(reducer);
  job.set(TableOutputFormat.OUTPUT_TABLE, table);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Put.class);
  job.setStrings("io.serializations", job.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName());
  if (partitioner == HRegionPartitioner.class) {
    job.setPartitionerClass(HRegionPartitioner.class);
    int regions =
      MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  } else if (partitioner != null) {
    job.setPartitionerClass(partitioner);
  }
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  initCredentials(job);
}
Example 3
Source File: TotalOrderPartitioner.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link
 *   org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
Example 4
Source File: InputSampler.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
 *   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
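The sampler above and the TotalOrderPartitioner.configure method shown in Examples 3 and 5 are two halves of the same mechanism: writePartitionFile emits getNumReduceTasks() - 1 sorted split keys, and configure reads them back and checks the count against the reducer total. The sketch below shows how a driver might wire the two together with this 0.20-era mapred API; the class name, input format, and paths are illustrative assumptions, not taken from any of the projects above.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderSetup {

  // Configure a job so that its map output is globally sorted across reducers.
  public static void configureTotalOrder(JobConf job) throws IOException {
    job.setNumReduceTasks(16);                                    // partition file will hold 15 keys
    job.setMapOutputKeyClass(Text.class);
    job.setInputFormat(KeyValueTextInputFormat.class);            // assumed input format
    FileInputFormat.setInputPaths(job, new Path("/data/input"));  // assumed input path

    // Tell TotalOrderPartitioner where the split keys live.
    Path partitionFile = new Path("/tmp/_partitions");            // assumed location
    TotalOrderPartitioner.setPartitionFile(job, partitionFile);

    // Sample 1% of records, at most 10,000 keys, from at most 10 splits,
    // then write getNumReduceTasks() - 1 split keys to the partition file.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.01, 10000, 10);
    InputSampler.writePartitionFile(job, sampler);

    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}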
Example 5
Source File: TotalOrderPartitioner.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link
 *   org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
Example 6
Source File: TeraInputFormat.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 7
Source File: TeraInputFormat.java From hadoop-book with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    RecordReader<Text, Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i + 1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for (Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 8
Source File: FetcherOutputFormat.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
    fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
Example 9
Source File: InputSampler.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
 *   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
Example 10
Source File: JobSubmitter.java From hadoop with Apache License 2.0 | 5 votes |
private void checkSpecs(Job job) throws ClassNotFoundException,
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ?
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
Example 11
Source File: TeraInputFormat.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 12
Source File: SleepJob.java From RDFS with Apache License 2.0 | 5 votes |
public RecordReader<IntWritable,IntWritable> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter)
    throws IOException {
  final int count = conf.getInt("sleep.job.map.sleep.count", 1);
  if (count < 0) throw new IOException("Invalid map count: " + count);
  final int redcount = conf.getInt("sleep.job.reduce.sleep.count", 1);
  if (redcount < 0)
    throw new IOException("Invalid reduce count: " + redcount);
  final int emitPerMapTask = (redcount * conf.getNumReduceTasks());
  return new RecordReader<IntWritable,IntWritable>() {
    private int records = 0;
    private int emitCount = 0;

    public boolean next(IntWritable key, IntWritable value)
        throws IOException {
      key.set(emitCount);
      int emit = emitPerMapTask / count;
      if ((emitPerMapTask) % count > records) {
        ++emit;
      }
      emitCount += emit;
      value.set(emit);
      return records++ < count;
    }
    public IntWritable createKey() { return new IntWritable(); }
    public IntWritable createValue() { return new IntWritable(); }
    public long getPos() throws IOException { return records; }
    public void close() throws IOException { }
    public float getProgress() throws IOException {
      return records / ((float)count);
    }
  };
}
Example 13
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0 | 5 votes |
/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table  The table to get the region count for.
 * @param job  The current job configuration to adjust.
 * @throws IOException When retrieving the table details fails.
 */
// Used by tests.
public static void limitNumReduceTasks(String table, JobConf job)
    throws IOException {
  int regions =
    MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
  if (job.getNumReduceTasks() > regions)
    job.setNumReduceTasks(regions);
}
Example 14
Source File: FetcherOutputFormat.java From anthelion with Apache License 2.0 | 5 votes |
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
    fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
Example 15
Source File: JobSubmitter.java From big-c with Apache License 2.0 | 5 votes |
private void checkSpecs(Job job) throws ClassNotFoundException,
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ?
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
Example 16
Source File: SegmentMerger.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
Example 17
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
Example 18
Source File: CrawlDbReader.java From anthelion with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
Example 19
Source File: SegmentMerger.java From anthelion with Apache License 2.0 | 4 votes |
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
Example 20
Source File: TaskCalculator.java From emr-dynamodb-connector with Apache License 2.0 | 4 votes |
public int getMaxMapTasks() throws IOException {
  JobConf conf = (JobConf) jobClient.getConf();

  // Total number of nodes in the cluster
  int nodes = jobClient.getClusterStatus().getTaskTrackers();
  log.info("Cluster has " + nodes + " active nodes.");
  if (nodes == 0) {
    log.warn("Cluster doesn't have any nodes");
    return 0;
  }

  // Memory per slot
  int slotMemory = conf.getInt("yarn.scheduler.minimum-allocation-mb", 1024); // Default value
  // from yarn-default.xml

  // Number of slots in a core node
  int nodeMemory = nodeCapacityProvider.getCoreNodeMemoryMB();
  int nodeSlots = nodeMemory / slotMemory;

  // Number of slots for a mapper
  int mapMemory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
  int mapSlots = (int) Math.ceil((double) mapMemory / slotMemory);

  // Number of slots for an application master
  int amMemory = conf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
  int appMasterSlots = (int) Math.ceil((double) amMemory / slotMemory);

  // Number of slots for a reducer
  int reduceMemory = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB,
      MRJobConfig.DEFAULT_REDUCE_MEMORY_MB);
  int reduceSlots = (int) Math.ceil((double) reduceMemory / slotMemory);

  // Number of reducers
  int reducers = conf.getNumReduceTasks();

  // Calculate the number of mappers
  int mappers = yarnContainerAllocator.getMaxMappers(nodes, reducers, nodeSlots,
      appMasterSlots, mapSlots, reduceSlots);

  log.info("Slot size: " + slotMemory + "MB.");
  log.info("Node manager can allocate " + nodeMemory + "MB (" + nodeSlots + " slots) for "
      + "containers on each node.");
  log.info("Each mapper needs: " + mapMemory + "MB. (" + mapSlots + " slots)");
  log.info("Each reducer needs: " + reduceMemory + "MB. (" + reduceSlots + " slots)");
  log.info("MapReduce Application Manager needs: " + amMemory + " MB. (" + appMasterSlots + " "
      + "slots)");
  log.info("Number of reducers: " + reducers);
  log.info("Max number of cluster map tasks: " + mappers);

  if (mappers < 1) {
    log.warn("The calculated max number of concurrent map tasks is less than 1. Use 1 instead.");
    mappers = 1;
  }
  return mappers;
}
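To make the slot arithmetic in Example 20 concrete, here is a worked example with hypothetical numbers; the final figure depends on yarnContainerAllocator.getMaxMappers, so it is only a plausible outcome, not a guaranteed result of this connector.

  slotMemory   = 1024 MB  (yarn.scheduler.minimum-allocation-mb)
  nodeMemory   = 8192 MB  ->  nodeSlots      = 8192 / 1024 = 8
  mapMemory    = 1536 MB  ->  mapSlots       = ceil(1536 / 1024) = 2
  amMemory     = 1536 MB  ->  appMasterSlots = ceil(1536 / 1024) = 2
  reduceMemory = 3072 MB  ->  reduceSlots    = ceil(3072 / 1024) = 3

With 4 nodes and 2 reducers the cluster offers 4 * 8 = 32 slots; if the allocator simply reserves 2 slots for the application master and 2 * 3 = 6 slots for the reducers, roughly (32 - 2 - 6) / 2 = 12 concurrent map tasks remain.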