Java Code Examples for org.apache.hadoop.mapred.JobConf#getInt()
The following examples show how to use org.apache.hadoop.mapred.JobConf#getInt().
They are drawn from a range of open-source projects; each example notes its source file, the project it comes from, and that project's license.
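Before the project examples, here is a minimal, self-contained sketch of the basic pattern. JobConf#getInt(name, defaultValue) returns the value of the named configuration property parsed as an int, or the supplied default when the property is not set. The property name "example.num.threads" below is made up purely for illustration.

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetIntSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // The property has not been set, so the supplied default (4) is returned.
    int threads = conf.getInt("example.num.threads", 4);
    System.out.println("threads = " + threads); // prints: threads = 4

    // After the property is set, getInt() parses and returns the stored value.
    conf.setInt("example.num.threads", 16);
    System.out.println(conf.getInt("example.num.threads", 4)); // prints: 16
  }
}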
Example 1
Source File: LinkRank.java From anthelion with Apache License 2.0
/**
 * Configures the job, sets the damping factor, rank one score, and other
 * needed values for analysis.
 */
public void configure(JobConf conf) {
  try {
    this.conf = conf;
    this.dampingFactor = conf.getFloat("link.analyze.damping.factor", 0.85f);
    this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f);
    this.itNum = conf.getInt("link.analyze.iteration", 0);
    limitPages = conf.getBoolean("link.ignore.limit.page", true);
    limitDomains = conf.getBoolean("link.ignore.limit.domain", true);
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
    throw new IllegalArgumentException(e);
  }
}
Example 2
Source File: AvroAsJsonOutputFormat.java From iow-hadoop-streaming with Apache License 2.0
static <K> void configureDataFileWriter(DataFileWriter<K> writer, JobConf job)
    throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
        org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
    String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
        ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  writer.setSyncInterval(job.getInt(
      org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL));

  // copy metadata from job
  for (Map.Entry<String, String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
          URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
  }
}
Example 3
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
Example 4
Source File: DistCp.java From hadoop-gpu with Apache License 2.0
/**
 * Mapper configuration.
 * Extracts source and destination file system, as well as
 * top-level paths on source and destination directories.
 * Gets the named file systems, to be used later in map.
 */
public void configure(JobConf job) {
  destPath = new Path(job.get(DST_DIR_LABEL, "/"));
  try {
    destFileSys = destPath.getFileSystem(job);
  } catch (IOException ex) {
    throw new RuntimeException("Unable to get the named file system.", ex);
  }
  sizeBuf = job.getInt("copy.buf.size", 128 * 1024);
  buffer = new byte[sizeBuf];
  ignoreReadFailures = job.getBoolean(Options.IGNORE_READ_FAILURES.propertyname, false);
  preserve_status = job.getBoolean(Options.PRESERVE_STATUS.propertyname, false);
  if (preserve_status) {
    preseved = FileAttribute.parse(job.get(PRESERVE_STATUS_LABEL));
  }
  update = job.getBoolean(Options.UPDATE.propertyname, false);
  overwrite = !update && job.getBoolean(Options.OVERWRITE.propertyname, false);
  this.job = job;
}
Example 5
Source File: MultithreadedMapRunner.java From hadoop-gpu with Apache License 2.0
@SuppressWarnings("unchecked") public void configure(JobConf jobConf) { int numberOfThreads = jobConf.getInt("mapred.map.multithreadedrunner.threads", 10); if (LOG.isDebugEnabled()) { LOG.debug("Configuring jobConf " + jobConf.getJobName() + " to use " + numberOfThreads + " threads"); } this.job = jobConf; //increment processed counter only if skipping feature is enabled this.incrProcCount = SkipBadRecords.getMapperMaxSkipRecords(job)>0 && SkipBadRecords.getAutoIncrMapperProcCount(job); this.mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf); // Creating a threadpool of the configured size to execute the Mapper // map method in parallel. executorService = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 0L, TimeUnit.MILLISECONDS, new BlockingArrayQueue (numberOfThreads)); }
Example 6
Source File: MultithreadedMapRunner.java From hadoop with Apache License 2.0
@SuppressWarnings("unchecked") public void configure(JobConf jobConf) { int numberOfThreads = jobConf.getInt(MultithreadedMapper.NUM_THREADS, 10); if (LOG.isDebugEnabled()) { LOG.debug("Configuring jobConf " + jobConf.getJobName() + " to use " + numberOfThreads + " threads"); } this.job = jobConf; //increment processed counter only if skipping feature is enabled this.incrProcCount = SkipBadRecords.getMapperMaxSkipRecords(job)>0 && SkipBadRecords.getAutoIncrMapperProcCount(job); this.mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf); // Creating a threadpool of the configured size to execute the Mapper // map method in parallel. executorService = new ThreadPoolExecutor(numberOfThreads, numberOfThreads, 0L, TimeUnit.MILLISECONDS, new BlockingArrayQueue (numberOfThreads)); }
Example 7
Source File: ValueAggregatorJobBase.java From big-c with Apache License 2.0
private static ArrayList<ValueAggregatorDescriptor> getAggregatorDescriptors(JobConf job) {
  String advn = "aggregator.descriptor";
  int num = job.getInt(advn + ".num", 0);
  ArrayList<ValueAggregatorDescriptor> retv = new ArrayList<ValueAggregatorDescriptor>(num);
  for (int i = 0; i < num; i++) {
    String spec = job.get(advn + "." + i);
    ValueAggregatorDescriptor ad = getValueAggregatorDescriptor(spec, job);
    if (ad != null) {
      retv.add(ad);
    }
  }
  return retv;
}
Example 8
Source File: RandomWriter.java From hadoop-gpu with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write
 * the data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
      1 * 1024 * 1024 * 1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
Example 9
Source File: FreeGenerator.java From nutch-htmlunit with Apache License 2.0
@Override
public void configure(JobConf job) {
  super.configure(job);
  defaultInterval = job.getInt("db.fetch.interval.default", 0);
  scfilters = new ScoringFilters(job);
  if (job.getBoolean(FILTER_KEY, false)) {
    filters = new URLFilters(job);
  }
  if (job.getBoolean(NORMALIZE_KEY, false)) {
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
  }
}
Example 10
Source File: DistCp.java From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcFileList = job.get(SRC_LIST_LABEL, "");
  Path srcFileListPath = new Path(srcFileList);
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcFileList)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
        ") total_size(" + cbsize + ") src_chunk_file_list_uri(" +
        srcFileList + ")");
  }

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  SequenceFile.Reader sl = null;

  String splitList = job.get(SPLIT_LIST_LABEL, "");
  if ("".equals(splitList)) {
    throw new RuntimeException("Invalid metadata: split_list_uri(" +
        srcFileList + ")");
  }

  // split file list which contains start pos and split length pairs;
  // they are used to split srcChunkFileList
  Path splitListPath = new Path(splitList);
  FileSystem splitListFs = splitListPath.getFileSystem(job);
  try {
    sl = new SequenceFile.Reader(splitListFs, splitListPath, job);
    LongWritable startpos = new LongWritable();
    LongWritable length = new LongWritable();
    while (sl.next(startpos, length)) {
      splits.add(new FileSplit(srcFileListPath, startpos.get(),
          length.get(), (String[]) null));
    }
  } finally {
    checkAndClose(sl);
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
Example 11
Source File: RandomWriter.java From hadoop-book with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write the
 * data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
      1 * 1024 * 1024 * 1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
Example 12
Source File: AvroRecordWriter.java From spork with Apache License 2.0
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    int level = job.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
        ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  // Do max as core-default.xml has io.file.buffer.size as 4K
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY,
      Math.max(job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL),
          DEFAULT_SYNC_INTERVAL)));

  // copy metadata from job
  for (Map.Entry<String, String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()), e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
          URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
  }
}
Example 13
Source File: RandomWriter.java From RDFS with Apache License 2.0
/**
 * Save the values out of the configuration that we need to write
 * the data.
 */
@Override
public void configure(JobConf job) {
  numBytesToWrite = job.getLong("test.randomwrite.bytes_per_map",
      1 * 1024 * 1024 * 1024);
  minKeySize = job.getInt("test.randomwrite.min_key", 10);
  keySizeRange = job.getInt("test.randomwrite.max_key", 1000) - minKeySize;
  minValueSize = job.getInt("test.randomwrite.min_value", 0);
  valueSizeRange = job.getInt("test.randomwrite.max_value", 20000) - minValueSize;
}
Example 14
Source File: NLineInputFormat.java From big-c with Apache License 2.0
public void configure(JobConf conf) {
  N = conf.getInt("mapreduce.input.lineinputformat.linespermap", 1);
}
Example 15
Source File: AbstractMROldApiSaveTest.java From elasticsearch-hadoop with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  return super.getSplits(job, job.getInt("actual.splits", 3));
}
Example 16
Source File: GenerateDistCacheData.java From big-c with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobConf jobConf = new JobConf(jobCtxt.getConfiguration());
  final JobClient client = new JobClient(jobConf);
  ClusterStatus stat = client.getClusterStatus(true);
  int numTrackers = stat.getTaskTrackers();
  final int fileCount = jobConf.getInt(GRIDMIX_DISTCACHE_FILE_COUNT, -1);

  // Total size of distributed cache files to be generated
  final long totalSize = jobConf.getLong(GRIDMIX_DISTCACHE_BYTE_COUNT, -1);
  // Get the path of the special file
  String distCacheFileList = jobConf.get(GRIDMIX_DISTCACHE_FILE_LIST);
  if (fileCount < 0 || totalSize < 0 || distCacheFileList == null) {
    throw new RuntimeException("Invalid metadata: #files (" + fileCount
        + "), total_size (" + totalSize + "), filelisturi ("
        + distCacheFileList + ")");
  }

  Path sequenceFile = new Path(distCacheFileList);
  FileSystem fs = sequenceFile.getFileSystem(jobConf);
  FileStatus srcst = fs.getFileStatus(sequenceFile);
  // Consider the number of TTs * mapSlotsPerTracker as number of mappers.
  int numMapSlotsPerTracker = jobConf.getInt(TTConfig.TT_MAP_SLOTS, 2);
  int numSplits = numTrackers * numMapSlotsPerTracker;

  List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
  LongWritable key = new LongWritable();
  BytesWritable value = new BytesWritable();

  // Average size of data to be generated by each map task
  final long targetSize = Math.max(totalSize / numSplits,
      DistributedCacheEmulator.AVG_BYTES_PER_MAP);
  long splitStartPosition = 0L;
  long splitEndPosition = 0L;
  long acc = 0L;
  long bytesRemaining = srcst.getLen();
  SequenceFile.Reader reader = null;
  try {
    reader = new SequenceFile.Reader(fs, sequenceFile, jobConf);
    while (reader.next(key, value)) {
      // If adding this file would put this split past the target size,
      // cut the last split and put this file in the next split.
      if (acc + key.get() > targetSize && acc != 0) {
        long splitSize = splitEndPosition - splitStartPosition;
        splits.add(new FileSplit(
            sequenceFile, splitStartPosition, splitSize, (String[]) null));
        bytesRemaining -= splitSize;
        splitStartPosition = splitEndPosition;
        acc = 0L;
      }
      acc += key.get();
      splitEndPosition = reader.getPosition();
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }
  if (bytesRemaining != 0) {
    splits.add(new FileSplit(
        sequenceFile, splitStartPosition, bytesRemaining, (String[]) null));
  }

  return splits;
}
Example 17
Source File: DistRaid.java From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 *
 * @param job
 *          The handle to the JobConf object
 * @param numSplits
 *          Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  final int srcCount = job.getInt(OP_COUNT_LABEL, -1);
  final int targetcount = srcCount / numSplits;
  String srclist = job.get(OP_LIST_LABEL, "");
  if (srcCount < 0 || "".equals(srclist)) {
    throw new RuntimeException("Invalid metadata: #files(" + srcCount
        + ") listuri(" + srclist + ")");
  }
  Path srcs = new Path(srclist);
  FileSystem fs = srcs.getFileSystem(job);

  List<FileSplit> splits = new ArrayList<FileSplit>(numSplits);

  Text key = new Text();
  PolicyInfo value = new PolicyInfo();
  SequenceFile.Reader in = null;
  long prev = 0L;
  int count = 0; // count src
  try {
    for (in = new SequenceFile.Reader(fs, srcs, job); in.next(key, value);) {
      long curr = in.getPosition();
      long delta = curr - prev;
      if (++count > targetcount) {
        count = 0;
        splits.add(new FileSplit(srcs, prev, delta, (String[]) null));
        prev = curr;
      }
    }
  } finally {
    in.close();
  }
  long remaining = fs.getFileStatus(srcs).getLen() - prev;
  if (remaining != 0) {
    splits.add(new FileSplit(srcs, prev, remaining, (String[]) null));
  }
  LOG.info("jobname= " + jobName + " numSplits=" + numSplits +
      ", splits.size()=" + splits.size());
  return splits.toArray(new FileSplit[splits.size()]);
}
Example 18
Source File: RegexMapper.java From big-c with Apache License 2.0
public void configure(JobConf job) {
  pattern = Pattern.compile(job.get(org.apache.hadoop.mapreduce.lib.map.
      RegexMapper.PATTERN));
  group = job.getInt(org.apache.hadoop.mapreduce.lib.map.
      RegexMapper.GROUP, 0);
}
Example 19
Source File: DistCp.java From RDFS with Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  long blocks = job.getLong(TOTAL_BLOCKS_LABEL, -1);
  String srcfilelist = job.get(SRC_LIST_LABEL, "");
  if (cnfiles < 0 || cbsize < 0 || blocks < 0 || "".equals(srcfilelist)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
        ") total_size(" + cbsize + ") listuri(" + srcfilelist + ")");
  }
  Path src = new Path(srcfilelist);
  FileSystem fs = src.getFileSystem(job);
  FileStatus srcst = fs.getFileStatus(src);

  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  LongWritable key = new LongWritable();
  FilePairComparable value = new FilePairComparable();
  final long targetsize = getTargetSize(job, numSplits);
  long pos = 0L;
  long last = 0L;
  long acc = 0L;
  long cbrem = srcst.getLen();
  SequenceFile.Reader sl = null;
  try {
    sl = new SequenceFile.Reader(fs, src, job);
    for (; sl.next(key, value); last = sl.getPosition()) {
      // if adding this split would put this split past the target size,
      // cut the last split and put this next file in the next split.
      long increment = getIncrement(key, value);
      if (acc + increment > targetsize && acc != 0) {
        long splitsize = last - pos;
        splits.add(new FileSplit(src, pos, splitsize, (String[]) null));
        cbrem -= splitsize;
        pos = last;
        acc = 0L;
      }
      acc += increment;
    }
  } finally {
    checkAndClose(sl);
  }
  if (cbrem != 0) {
    splits.add(new FileSplit(src, pos, cbrem, (String[]) null));
  }

  return splits.toArray(new FileSplit[splits.size()]);
}
Example 20
Source File: TaskCalculator.java From emr-dynamodb-connector with Apache License 2.0
public int getMaxMapTasks() throws IOException {
  JobConf conf = (JobConf) jobClient.getConf();

  // Total number of nodes in the cluster
  int nodes = jobClient.getClusterStatus().getTaskTrackers();
  log.info("Cluster has " + nodes + " active nodes.");
  if (nodes == 0) {
    log.warn("Cluster doesn't have any nodes");
    return 0;
  }

  // Memory per slot
  int slotMemory = conf.getInt("yarn.scheduler.minimum-allocation-mb", 1024); // Default value
                                                                              // from yarn-default.xml

  // Number of slots in a core node
  int nodeMemory = nodeCapacityProvider.getCoreNodeMemoryMB();
  int nodeSlots = nodeMemory / slotMemory;

  // Number of slots for a mapper
  int mapMemory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
  int mapSlots = (int) Math.ceil((double) mapMemory / slotMemory);

  // Number of slots for an application master
  int amMemory = conf.getInt(MRJobConfig.MR_AM_VMEM_MB, MRJobConfig.DEFAULT_MR_AM_VMEM_MB);
  int appMasterSlots = (int) Math.ceil((double) amMemory / slotMemory);

  // Number of slots for a reducer
  int reduceMemory = conf.getInt(MRJobConfig.REDUCE_MEMORY_MB, MRJobConfig
      .DEFAULT_REDUCE_MEMORY_MB);
  int reduceSlots = (int) Math.ceil((double) reduceMemory / slotMemory);

  // Number of reducers
  int reducers = conf.getNumReduceTasks();

  // Calculate the number of mappers
  int mappers = yarnContainerAllocator.getMaxMappers(nodes, reducers, nodeSlots,
      appMasterSlots, mapSlots, reduceSlots);

  log.info("Slot size: " + slotMemory + "MB.");
  log.info("Node manager can allocate " + nodeMemory + "MB (" + nodeSlots + " slots) for "
      + "containers on each node.");
  log.info("Each mapper needs: " + mapMemory + "MB. (" + mapSlots + " slots)");
  log.info("Each reducer needs: " + reduceMemory + "MB. (" + reduceSlots + " slots)");
  log.info("MapReduce Application Manager needs: " + amMemory + " MB. (" + appMasterSlots + " "
      + "slots)");
  log.info("Number of reducers: " + reducers);
  log.info("Max number of cluster map tasks: " + mappers);

  if (mappers < 1) {
    log.warn("The calculated max number of concurrent map tasks is less than 1. Use 1 instead.");
    mappers = 1;
  }
  return mappers;
}