Java Code Examples for org.apache.hadoop.mapred.JobConf#getNumReduceTasks()
The following examples show how to use
org.apache.hadoop.mapred.JobConf#getNumReduceTasks(). Each example is taken from an open source project; the source file, project, and license are noted above each snippet.
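JobConf#getNumReduceTasks() returns the number of reduce tasks configured for the job (the mapred.reduce.tasks / mapreduce.job.reduces setting; the default is 1). A recurring pattern in the examples below is to read this value when partitioning map output or when validating a job's output specification. The minimal sketch below illustrates the partitioning case; ModuloPartitioner is a hypothetical class written for this page, not part of Hadoop.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;

// Hypothetical partitioner: reads the configured reducer count once in
// configure() and later routes each key to one of that many partitions.
public class ModuloPartitioner implements Partitioner<IntWritable, Text> {

  private int reduceTasks = 1;

  public void configure(JobConf job) {
    // Number of reducers the job was configured with (defaults to 1).
    reduceTasks = job.getNumReduceTasks();
  }

  public int getPartition(IntWritable key, Text value, int numPartitions) {
    // The framework passes numPartitions == getNumReduceTasks(); the field
    // cached in configure() is used here only to show the call in context.
    return (key.get() & Integer.MAX_VALUE) % reduceTasks;
  }
}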
Example 1
Source File: ZephyrOutputFormat.java From zephyr with Apache License 2.0 | 6 votes |
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf job)
    throws FileAlreadyExistsException, InvalidJobConfException, IOException {
  // Ensure that the output directory is set and not already there
  Path outDir = getOutputPath(job);
  if (outDir == null && job.getNumReduceTasks() != 0) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (outDir != null) {
    FileSystem fs = outDir.getFileSystem(job);
    // normalize the output directory
    outDir = fs.makeQualified(outDir);
    setOutputPath(job, outDir);

    // get delegation token for the outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[]{outDir}, job);

    String jobUuid = job.get("zephyr.job.uuid");
    if (jobUuid == null)
      throw new InvalidJobConfException(
          "This output format REQUIRES the value zephyr.job.uuid to be specified in the job configuration!");

    // // check its existence
    // if (fs.exists(outDir)) {
    //   throw new FileAlreadyExistsException("Output directory " + outDir
    //       + " already exists");
    // }
  }
}
Example 2
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0 | 6 votes |
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job configuration to adjust.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *   job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReduceJob(String table,
    Class<? extends TableReduce> reducer, JobConf job, Class partitioner,
    boolean addDependencyJars) throws IOException {
  job.setOutputFormat(TableOutputFormat.class);
  job.setReducerClass(reducer);
  job.set(TableOutputFormat.OUTPUT_TABLE, table);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(Put.class);
  job.setStrings("io.serializations", job.get("io.serializations"),
      MutationSerialization.class.getName(), ResultSerialization.class.getName());
  if (partitioner == HRegionPartitioner.class) {
    job.setPartitionerClass(HRegionPartitioner.class);
    int regions =
      MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
    if (job.getNumReduceTasks() > regions) {
      job.setNumReduceTasks(regions);
    }
  } else if (partitioner != null) {
    job.setPartitionerClass(partitioner);
  }
  if (addDependencyJars) {
    addDependencyJars(job);
  }
  initCredentials(job);
}
Example 3
Source File: TotalOrderPartitioner.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link
 *   org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
Example 4
Source File: InputSampler.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
 *   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
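The sampler above and the TotalOrderPartitioner.configure method shown in Examples 3 and 5 are two halves of the same mechanism: writePartitionFile emits getNumReduceTasks() - 1 sorted split keys, and configure reads them back and checks the count against the reducer total. The sketch below shows how a driver might wire the two together with this 0.20-era mapred API; the class name, input format, and paths are illustrative assumptions, not taken from any of the projects above.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderSetup {

  // Configure a job so that its map output is globally sorted across reducers.
  public static void configureTotalOrder(JobConf job) throws IOException {
    job.setNumReduceTasks(16);                                    // partition file will hold 15 keys
    job.setMapOutputKeyClass(Text.class);
    job.setInputFormat(KeyValueTextInputFormat.class);            // assumed input format
    FileInputFormat.setInputPaths(job, new Path("/data/input"));  // assumed input path

    // Tell TotalOrderPartitioner where the split keys live.
    Path partitionFile = new Path("/tmp/_partitions");            // assumed location
    TotalOrderPartitioner.setPartitionFile(job, partitionFile);

    // Sample 1% of records, at most 10,000 keys, from at most 10 splits,
    // then write getNumReduceTasks() - 1 split keys to the partition file.
    InputSampler.Sampler<Text, Text> sampler =
        new InputSampler.RandomSampler<Text, Text>(0.01, 10000, 10);
    InputSampler.writePartitionFile(job, sampler);

    job.setPartitionerClass(TotalOrderPartitioner.class);
  }
}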
Example 5
Source File: TotalOrderPartitioner.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link
 *   org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
  try {
    String parts = getPartitionFile(job);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
      ? FileSystem.getLocal(job)     // assume in DistributedCache
      : partFile.getFileSystem(job);

    Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, job);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i+1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder =
      job.getBoolean("total.order.partitioner.natural.order", true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[])splitPoints, 0,
          splitPoints.length, new byte[0],
          job.getInt("total.order.partitioner.max.trie.depth", 2));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
Example 6
Source File: TeraInputFormat.java From hadoop-gpu with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 7
Source File: TeraInputFormat.java From hadoop-book with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for (int i = 0; i < samples; ++i) {
    RecordReader<Text, Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i + 1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for (Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 8
Source File: FetcherOutputFormat.java From nutch-htmlunit with Apache License 2.0 | 5 votes |
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
    fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
Example 9
Source File: InputSampler.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link
 *   org.apache.hadoop.mapred.lib.TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K,V> void writePartitionFile(JobConf job,
    Sampler<K,V> sampler) throws IOException {
  final InputFormat<K,V> inf = (InputFormat<K,V>) job.getInputFormat();
  int numPartitions = job.getNumReduceTasks();
  K[] samples = sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator =
    (RawComparator<K>) job.getOutputKeyComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(job));
  FileSystem fs = dst.getFileSystem(job);
  if (fs.exists(dst)) {
    fs.delete(dst, false);
  }
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for(int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
Example 10
Source File: JobSubmitter.java From hadoop with Apache License 2.0 | 5 votes |
private void checkSpecs(Job job) throws ClassNotFoundException,
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ?
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
Example 11
Source File: TeraInputFormat.java From RDFS with Apache License 2.0 | 5 votes |
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(JobConf conf,
                                      Path partFile) throws IOException {
  TeraInputFormat inFormat = new TeraInputFormat();
  TextSampler sampler = new TextSampler();
  Text key = new Text();
  Text value = new Text();
  int partitions = conf.getNumReduceTasks();
  long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
  InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
  int samples = Math.min(10, splits.length);
  long recordsPerSample = sampleSize / samples;
  int sampleStep = splits.length / samples;
  long records = 0;
  // take N samples from different parts of the input
  for(int i=0; i < samples; ++i) {
    RecordReader<Text,Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
    while (reader.next(key, value)) {
      sampler.addKey(key);
      records += 1;
      if ((i+1) * recordsPerSample <= records) {
        break;
      }
    }
  }
  FileSystem outFs = partFile.getFileSystem(conf);
  if (outFs.exists(partFile)) {
    outFs.delete(partFile, false);
  }
  SequenceFile.Writer writer =
    SequenceFile.createWriter(outFs, conf, partFile, Text.class,
                              NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  for(Text split : sampler.createPartitions(partitions)) {
    writer.append(split, nullValue);
  }
  writer.close();
}
Example 12
Source File: SleepJob.java From RDFS with Apache License 2.0 | 5 votes |
public RecordReader<IntWritable,IntWritable> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter)
    throws IOException {
  final int count = conf.getInt("sleep.job.map.sleep.count", 1);
  if (count < 0) throw new IOException("Invalid map count: " + count);
  final int redcount = conf.getInt("sleep.job.reduce.sleep.count", 1);
  if (redcount < 0)
    throw new IOException("Invalid reduce count: " + redcount);
  final int emitPerMapTask = (redcount * conf.getNumReduceTasks());
  return new RecordReader<IntWritable,IntWritable>() {
    private int records = 0;
    private int emitCount = 0;

    public boolean next(IntWritable key, IntWritable value)
        throws IOException {
      key.set(emitCount);
      int emit = emitPerMapTask / count;
      if ((emitPerMapTask) % count > records) {
        ++emit;
      }
      emitCount += emit;
      value.set(emit);
      return records++ < count;
    }
    public IntWritable createKey() { return new IntWritable(); }
    public IntWritable createValue() { return new IntWritable(); }
    public long getPos() throws IOException { return records; }
    public void close() throws IOException { }
    public float getProgress() throws IOException {
      return records / ((float)count);
    }
  };
}
Example 13
Source File: TableMapReduceUtil.java From hbase with Apache License 2.0 | 5 votes |
/**
 * Ensures that the given number of reduce tasks for the given job
 * configuration does not exceed the number of regions for the given table.
 *
 * @param table  The table to get the region count for.
 * @param job  The current job configuration to adjust.
 * @throws IOException When retrieving the table details fails.
 */
// Used by tests.
public static void limitNumReduceTasks(String table, JobConf job)
    throws IOException {
  int regions =
    MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
  if (job.getNumReduceTasks() > regions)
    job.setNumReduceTasks(regions);
}
Example 14
Source File: FetcherOutputFormat.java From anthelion with Apache License 2.0 | 5 votes |
public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
  Path out = FileOutputFormat.getOutputPath(job);
  if ((out == null) && (job.getNumReduceTasks() != 0)) {
    throw new InvalidJobConfException("Output directory not set in JobConf.");
  }
  if (fs == null) {
    fs = out.getFileSystem(job);
  }
  if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME)))
    throw new IOException("Segment already fetched!");
}
Example 15
Source File: JobSubmitter.java From big-c with Apache License 2.0 | 5 votes |
private void checkSpecs(Job job) throws ClassNotFoundException,
    InterruptedException, IOException {
  JobConf jConf = (JobConf)job.getConfiguration();
  // Check the output specification
  if (jConf.getNumReduceTasks() == 0 ?
      jConf.getUseNewMapper() : jConf.getUseNewReducer()) {
    org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
      ReflectionUtils.newInstance(job.getOutputFormatClass(),
        job.getConfiguration());
    output.checkOutputSpecs(job);
  } else {
    jConf.getOutputFormat().checkOutputSpecs(jtFs, jConf);
  }
}
Example 16
Source File: SegmentMerger.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
Example 17
Source File: CrawlDbReader.java From nutch-htmlunit with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
Example 18
Source File: CrawlDbReader.java From anthelion with Apache License 2.0 | 4 votes |
public void configure(JobConf job) {
  topN = job.getLong("db.reader.topn", 100) / job.getNumReduceTasks();
}
Example 19
Source File: SegmentMerger.java From anthelion with Apache License 2.0 | 4 votes |
public void configure(JobConf conf) {
  setConf(conf);
  if (sliceSize > 0) {
    sliceSize = sliceSize / conf.getNumReduceTasks();
  }
}
Example 20
Source File: TaskCalculator.java From emr-dynamodb-connector with Apache License 2.0 | 4 votes |
public int getMaxMapTasks() throws IOException {
  JobConf conf = (JobConf) jobClient.getConf();

  // Total number of nodes in the cluster
  int nodes = jobClient.getClusterStatus().getTaskTrackers();
  log.info("Cluster has " + nodes + " active nodes.");
  if (nodes == 0) {
    log.warn("Cluster doesn't have any nodes");
    return 0;
  }

  // Memory per slot
  int slotMemory = conf.getInt("yarn.scheduler.minimum-allocation-mb", 1024); // Default value
  // from yarn-default.xml

  // Number of slots in a core node
  int nodeMemory = nodeCapacityProvider.getCoreNodeMemoryMB();
  int nodeSlots = nodeMemory / slotMemory;

  // Number of slots for a mapper
  int mapMemory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
  int mapSlots = (int) Math.ceil((double) mapMemory / slotMemory);

  // Number of slots for an application master
  int amMemory = conf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
  int appMasterSlots = (int) Math.ceil((double) amMemory / slotMemory);

  // Number of slots for a reducer
  int reduceMemory = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB,
      MRJobConfig.DEFAULT_REDUCE_MEMORY_MB);
  int reduceSlots = (int) Math.ceil((double) reduceMemory / slotMemory);

  // Number of reducers
  int reducers = conf.getNumReduceTasks();

  // Calculate the number of mappers
  int mappers = yarnContainerAllocator.getMaxMappers(nodes, reducers, nodeSlots,
      appMasterSlots, mapSlots, reduceSlots);

  log.info("Slot size: " + slotMemory + "MB.");
  log.info("Node manager can allocate " + nodeMemory + "MB (" + nodeSlots + " slots) for "
      + "containers on each node.");
  log.info("Each mapper needs: " + mapMemory + "MB. (" + mapSlots + " slots)");
  log.info("Each reducer needs: " + reduceMemory + "MB. (" + reduceSlots + " slots)");
  log.info("MapReduce Application Manager needs: " + amMemory + " MB. (" + appMasterSlots + " "
      + "slots)");
  log.info("Number of reducers: " + reducers);
  log.info("Max number of cluster map tasks: " + mappers);

  if (mappers < 1) {
    log.warn("The calculated max number of concurrent map tasks is less than 1. Use 1 instead.");
    mappers = 1;
  }
  return mappers;
}
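To make the slot arithmetic in Example 20 concrete, here is a worked example with hypothetical numbers; the final figure depends on yarnContainerAllocator.getMaxMappers, so it is only a plausible outcome, not a guaranteed result of this connector.

  slotMemory   = 1024 MB  (yarn.scheduler.minimum-allocation-mb)
  nodeMemory   = 8192 MB  ->  nodeSlots      = 8192 / 1024 = 8
  mapMemory    = 1536 MB  ->  mapSlots       = ceil(1536 / 1024) = 2
  amMemory     = 1536 MB  ->  appMasterSlots = ceil(1536 / 1024) = 2
  reduceMemory = 3072 MB  ->  reduceSlots    = ceil(3072 / 1024) = 3

With 4 nodes and 2 reducers the cluster offers 4 * 8 = 32 slots; if the allocator simply reserves 2 slots for the application master and 2 * 3 = 6 slots for the reducers, roughly (32 - 2 - 6) / 2 = 12 concurrent map tasks remain.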