Java Code Examples for org.apache.hadoop.mapreduce.TaskAttemptContext#getTaskAttemptID()
The following examples show how to use org.apache.hadoop.mapreduce.TaskAttemptContext#getTaskAttemptID().
Each example is drawn from an open-source project; the source file, originating project, and license are noted above the code.
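Before the project examples, here is a minimal, self-contained sketch of the most common reason to call getTaskAttemptID(): deriving an attempt-scoped working path so that concurrent attempts of the same task never write to the same location. This sketch is not taken from any of the projects below; the class name TaskScopedPaths and the "_temporary" layout are illustrative assumptions, while TaskAttemptContext, TaskAttemptID, and FileOutputFormat.getOutputPath() are the standard Hadoop APIs.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Illustrative helper (assumed names, not from the examples below): builds a
// task-attempt-scoped temporary directory from the attempt ID.
public final class TaskScopedPaths {

  private TaskScopedPaths() {
  }

  /**
   * Returns a per-attempt working directory under the configured job output
   * directory, e.g. outputDir/_temporary/attempt_..._m_000000_0.
   */
  public static Path attemptWorkPath(TaskAttemptContext context) throws IOException {
    TaskAttemptID attemptId = context.getTaskAttemptID();
    Path outputDir = FileOutputFormat.getOutputPath(context);
    if (outputDir == null) {
      throw new IOException("No output directory configured for " + attemptId);
    }
    // attemptId.toString() yields a globally unique name such as
    // attempt_1695040022000_0001_m_000003_0, so attempts never share a path.
    return new Path(new Path(outputDir, "_temporary"), attemptId.toString());
  }
}

The committer examples that follow apply the same idea: the attempt ID names a scratch location whose contents are promoted on commit and discarded on abort, and the test utilities use it to build dummy MapContext instances for record readers.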
Example 1
Source File: FileOutputCommitter.java From hadoop-gpu with Apache License 2.0
/**
 * Move the files from the work directory to the job output directory
 * @param context the task context
 */
public void commitTask(TaskAttemptContext context) throws IOException {
  TaskAttemptID attemptId = context.getTaskAttemptID();
  if (workPath != null) {
    context.progress();
    if (outputFileSystem.exists(workPath)) {
      // Move the task outputs to their final place
      moveTaskOutputs(context, outputFileSystem, outputPath, workPath);
      // Delete the temporary task-specific output directory
      if (!outputFileSystem.delete(workPath, true)) {
        LOG.warn("Failed to delete the temporary output" +
            " directory of task: " + attemptId + " - " + workPath);
      }
      LOG.info("Saved output of task '" + attemptId + "' to " + outputPath);
    }
  }
}
Example 2
Source File: TestCombineTextInputFormat.java From big-c with Apache License 2.0
private static List<Text> readSplit(InputFormat<LongWritable,Text> format,
    InputSplit split, Job job) throws IOException, InterruptedException {
  List<Text> result = new ArrayList<Text>();
  Configuration conf = job.getConfiguration();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(conf);
  RecordReader<LongWritable, Text> reader = format.createRecordReader(split,
      MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
  MapContext<LongWritable,Text,LongWritable,Text> mcontext =
      new MapContextImpl<LongWritable,Text,LongWritable,Text>(conf,
      context.getTaskAttemptID(), reader, null, null,
      MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mcontext);
  while (reader.nextKeyValue()) {
    result.add(new Text(reader.getCurrentValue()));
  }
  return result;
}
Example 3
Source File: SpliceOutputCommitter.java From spliceengine with GNU Affero General Public License v3.0
@Override
public void abortTask(TaskAttemptContext taskContext) throws IOException {
  if (LOG.isDebugEnabled())
    SpliceLogUtils.debug(LOG, "abortTask " + taskContext.getTaskAttemptID());
  TxnView txn = currentTxn.get();
  if (txn == null)
    throw new IOException("no transaction associated with task attempt Id " +
        taskContext.getTaskAttemptID());
  SIDriver.driver().lifecycleManager().rollback(txn.getTxnId());
}
Example 4
Source File: TestFixedLengthInputFormat.java From big-c with Apache License 2.0
private static List<String> readSplit(FixedLengthInputFormat format,
    InputSplit split, Job job) throws Exception {
  List<String> result = new ArrayList<String>();
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
  RecordReader<LongWritable, BytesWritable> reader =
      format.createRecordReader(split, context);
  MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
      mcontext =
      new MapContextImpl<LongWritable, BytesWritable, LongWritable,
      BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(),
      reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
  LongWritable key;
  BytesWritable value;
  try {
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
      key = reader.getCurrentKey();
      value = reader.getCurrentValue();
      result.add(new String(value.getBytes(), 0, value.getLength()));
    }
  } finally {
    reader.close();
  }
  return result;
}
Example 5
Source File: TestDistCacheEmulation.java From hadoop with Apache License 2.0
/**
 * Validate setupGenerateDistCacheData by validating <li>permissions of the
 * distributed cache directories and <li>content of the generated sequence
 * file. This includes validation of dist cache file paths and their file
 * sizes.
 */
private void validateSetupGenDC(Configuration jobConf, long[] sortedFileSizes)
    throws IOException, InterruptedException {

  // build things needed for validation
  long sumOfFileSizes = 0;
  for (int i = 0; i < sortedFileSizes.length; i++) {
    sumOfFileSizes += sortedFileSizes[i];
  }

  FileSystem fs = FileSystem.get(jobConf);
  assertEquals("Number of distributed cache files to be generated is wrong.",
      sortedFileSizes.length,
      jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
  assertEquals("Total size of dist cache files to be generated is wrong.",
      sumOfFileSizes,
      jobConf.getLong(GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
  Path filesListFile = new Path(
      jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
  FileStatus stat = fs.getFileStatus(filesListFile);
  assertEquals("Wrong permissions of dist Cache files list file "
      + filesListFile, new FsPermission((short) 0644), stat.getPermission());

  InputSplit split =
      new FileSplit(filesListFile, 0, stat.getLen(), (String[]) null);
  TaskAttemptContext taskContext = MapReduceTestUtil
      .createDummyMapTaskAttemptContext(jobConf);
  RecordReader<LongWritable, BytesWritable> reader =
      new GenerateDistCacheData.GenDCDataFormat()
          .createRecordReader(split, taskContext);
  MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable>
      mapContext = new MapContextImpl<LongWritable, BytesWritable,
      NullWritable, BytesWritable>(jobConf, taskContext.getTaskAttemptID(),
      reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
  reader.initialize(split, mapContext);

  // start validating setupGenerateDistCacheData
  doValidateSetupGenDC(reader, fs, sortedFileSizes);
}
Example 6
Source File: PartialFileOutputCommitter.java From hadoop with Apache License 2.0
@Override
public void cleanUpPartialOutputForTask(TaskAttemptContext context)
    throws IOException {

  // we double check this is never invoked from a non-preemptable subclass.
  // This should never happen, since the invoking codes is checking it too,
  // but it is safer to double check. Errors handling this would produce
  // inconsistent output.
  if (!this.getClass().isAnnotationPresent(Checkpointable.class)) {
    throw new IllegalStateException("Invoking cleanUpPartialOutputForTask() " +
        "from non @Preemptable class");
  }
  FileSystem fs =
      fsFor(getTaskAttemptPath(context), context.getConfiguration());
  LOG.info("cleanUpPartialOutputForTask: removing everything belonging to " +
      context.getTaskAttemptID().getTaskID() + " in: " +
      getCommittedTaskPath(context).getParent());
  final TaskAttemptID taid = context.getTaskAttemptID();
  final TaskID tid = taid.getTaskID();
  Path pCommit = getCommittedTaskPath(context).getParent();
  // remove any committed output
  for (int i = 0; i < taid.getId(); ++i) {
    TaskAttemptID oldId = new TaskAttemptID(tid, i);
    Path pTask = new Path(pCommit, oldId.toString());
    if (fs.exists(pTask) && !fs.delete(pTask, true)) {
      throw new IOException("Failed to delete " + pTask);
    }
  }
}
Example 7
Source File: BlurOutputCommitter.java From incubator-retired-blur with Apache License 2.0
private Conf setup(TaskAttemptContext context) throws IOException {
  LOG.info("Setting up committer with task attempt [{0}]",
      context.getTaskAttemptID().toString());
  Conf conf = new Conf();
  conf._configuration = context.getConfiguration();
  conf._tableDescriptor = BlurOutputFormat.getTableDescriptor(conf._configuration);
  int shardCount = conf._tableDescriptor.getShardCount();
  int attemptId = context.getTaskAttemptID().getTaskID().getId();
  int shardId = attemptId % shardCount;
  conf._taskAttemptID = context.getTaskAttemptID();
  Path tableOutput = BlurOutputFormat.getOutputPath(conf._configuration);
  String shardName = ShardUtil.getShardName(BlurConstants.SHARD_PREFIX, shardId);
  conf._indexPath = new Path(tableOutput, shardName);
  conf._newIndex = new Path(conf._indexPath, conf._taskAttemptID.toString() + ".tmp");
  return conf;
}
Example 8
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test with record length set to 0
 */
@Test (timeout=5000)
public void testZeroRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  Job job = Job.getInstance(defaultConf);
  // Set the fixed length record length config property
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 0);
  FileInputFormat.setInputPaths(job, workDir);
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context =
          MapReduceTestUtil.createDummyMapTaskAttemptContext(
          job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
          mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable,
          BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(),
          reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for zero record length:", exceptionThrown);
}
Example 9
Source File: TestFixedLengthInputFormat.java From hadoop with Apache License 2.0
/**
 * Test with no record length set.
 */
@Test (timeout=5000)
public void testNoRecordLength() throws Exception {
  localFs.delete(workDir, true);
  Path file = new Path(workDir, new String("testFormat.txt"));
  createFile(file, null, 10, 10);
  // Create the job and do not set fixed record length
  Job job = Job.getInstance(defaultConf);
  FileInputFormat.setInputPaths(job, workDir);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  boolean exceptionThrown = false;
  for (InputSplit split : splits) {
    try {
      TaskAttemptContext context = MapReduceTestUtil.
          createDummyMapTaskAttemptContext(job.getConfiguration());
      RecordReader<LongWritable, BytesWritable> reader =
          format.createRecordReader(split, context);
      MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable>
          mcontext =
          new MapContextImpl<LongWritable, BytesWritable, LongWritable,
          BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(),
          reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
    } catch(IOException ioe) {
      exceptionThrown = true;
      LOG.info("Exception message:" + ioe.getMessage());
    }
  }
  assertTrue("Exception for not setting record length:", exceptionThrown);
}
Example 10
Source File: TestMRSequenceFileInputFilter.java From big-c with Apache License 2.0
private int countRecords(int numSplits)
    throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format =
      new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits =
        random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job,
      fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader =
        format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext =
        new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
        job.getConfiguration(),
        context.getTaskAttemptID(), reader, null, null,
        MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
Example 11
Source File: BlurOutputCommitter.java From incubator-retired-blur with Apache License 2.0
@Override
public boolean needsTaskCommit(TaskAttemptContext context) throws IOException {
  int numReduceTasks = context.getNumReduceTasks();
  TaskAttemptID taskAttemptID = context.getTaskAttemptID();
  return taskAttemptID.isMap() && numReduceTasks != 0 ? false : true;
}
Example 12
Source File: FileOutputCommitter.java From big-c with Apache License 2.0
@Override
public void recoverTask(TaskAttemptContext context)
    throws IOException {
  if(hasOutputPath()) {
    context.progress();
    TaskAttemptID attemptId = context.getTaskAttemptID();
    int previousAttempt = getAppAttemptId(context) - 1;
    if (previousAttempt < 0) {
      throw new IOException ("Cannot recover task output for first attempt...");
    }

    Path previousCommittedTaskPath = getCommittedTaskPath(
        previousAttempt, context);
    FileSystem fs = previousCommittedTaskPath.getFileSystem(context.getConfiguration());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Trying to recover task from " + previousCommittedTaskPath);
    }
    if (algorithmVersion == 1) {
      if (fs.exists(previousCommittedTaskPath)) {
        Path committedTaskPath = getCommittedTaskPath(context);
        if (fs.exists(committedTaskPath)) {
          if (!fs.delete(committedTaskPath, true)) {
            throw new IOException("Could not delete "+committedTaskPath);
          }
        }
        //Rename can fail if the parent directory does not yet exist.
        Path committedParent = committedTaskPath.getParent();
        fs.mkdirs(committedParent);
        if (!fs.rename(previousCommittedTaskPath, committedTaskPath)) {
          throw new IOException("Could not rename " + previousCommittedTaskPath +
              " to " + committedTaskPath);
        }
      } else {
        LOG.warn(attemptId+" had no output to recover.");
      }
    } else {
      // essentially a no-op, but for backwards compatibility
      // after upgrade to the new fileOutputCommitter,
      // check if there are any output left in committedTaskPath
      if (fs.exists(previousCommittedTaskPath)) {
        LOG.info("Recovering task for upgrading scenario, moving files from "
            + previousCommittedTaskPath + " to " + outputPath);
        FileStatus from = fs.getFileStatus(previousCommittedTaskPath);
        mergePaths(fs, from, outputPath);
      }
      LOG.info("Done recovering task " + attemptId);
    }
  } else {
    LOG.warn("Output Path is null in recoverTask()");
  }
}
Example 13
Source File: TestMRKeyValueTextInputFormat.java From big-c with Apache License 2.0
@Test
public void testSplitableCodecs() throws Exception {
  final Job job = Job.getInstance(defaultConf);
  final Configuration conf = job.getConfiguration();

  // Create the codec
  CompressionCodec codec = null;
  try {
    codec = (CompressionCodec)
        ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
  } catch (ClassNotFoundException cnfe) {
    throw new IOException("Illegal codec!");
  }
  Path file = new Path(workDir, "test"+codec.getDefaultExtension());

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 500000;
  FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 4) + 1) {

    LOG.info("creating; entries = " + length);

    // create a file with length entries
    Writer writer =
        new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i * 2));
        writer.write("\t");
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
      LOG.info("splitting: requesting = " + numSplits);
      List<InputSplit> splits = format.getSplits(job);
      LOG.info("splitting: got = " + splits.size());

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        LOG.debug("split["+j+"]= " + splits.get(j));
        TaskAttemptContext context = MapReduceTestUtil.
            createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<Text, Text> reader = format.createRecordReader(
            splits.get(j), context);
        Class<?> clazz = reader.getClass();
        MapContext<Text, Text, Text, Text> mcontext =
            new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
            context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), splits.get(j));
        reader.initialize(splits.get(j), mcontext);

        Text key = null;
        Text value = null;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            value = reader.getCurrentValue();
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + k + "," + v);
            assertFalse(k + "," + v + " in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }
          if (count > 0) {
            LOG.info("splits["+j+"]="+splits.get(j)+" count=" + count);
          } else {
            LOG.debug("splits["+j+"]="+splits.get(j)+" count=" + count);
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
Example 14
Source File: TestMRSequenceFileAsBinaryInputFormat.java From big-c with Apache License 2.0
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer = new SequenceFile.Writer(fs,
      job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }
  TaskAttemptContext context = MapReduceTestUtil.
      createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat =
      new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader =
        bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable>
        mcontext = new MapContextImpl<BytesWritable, BytesWritable,
        BytesWritable, BytesWritable>(job.getConfiguration(),
        context.getTaskAttemptID(), reader, null, null,
        MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
            "Keys don't match: " + "*" + cmpkey.toString() + ":" +
            tkey.toString() + "*",
            cmpkey.toString().equals(tkey.toString()));
        assertTrue(
            "Vals don't match: " + "*" + cmpval.toString() + ":" +
            tval.toString() + "*",
            cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
Example 15
Source File: TestMRKeyValueTextInputFormat.java From big-c with Apache License 2.0
@Test
public void testFormat() throws Exception {
  Job job = Job.getInstance(new Configuration(defaultConf));
  Path file = new Path(workDir, "test.txt");

  int seed = new Random().nextInt();
  LOG.info("seed = " + seed);
  Random random = new Random(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int MAX_LENGTH = 10000;
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {

    LOG.debug("creating; entries = " + length);

    // create a file with length entries
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
      for (int i = 0; i < length; i++) {
        writer.write(Integer.toString(i * 2));
        writer.write("\t");
        writer.write(Integer.toString(i));
        writer.write("\n");
      }
    } finally {
      writer.close();
    }

    // try splitting the file in a variety of sizes
    KeyValueTextInputFormat format = new KeyValueTextInputFormat();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
      LOG.debug("splitting: requesting = " + numSplits);
      List<InputSplit> splits = format.getSplits(job);
      LOG.debug("splitting: got = " + splits.size());

      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.size(); j++) {
        LOG.debug("split["+j+"]= " + splits.get(j));
        TaskAttemptContext context = MapReduceTestUtil.
            createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<Text, Text> reader = format.createRecordReader(
            splits.get(j), context);
        Class<?> clazz = reader.getClass();
        assertEquals("reader class is KeyValueLineRecordReader.",
            KeyValueLineRecordReader.class, clazz);
        MapContext<Text, Text, Text, Text> mcontext =
            new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
            context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), splits.get(j));
        reader.initialize(splits.get(j), mcontext);

        Text key = null;
        Text value = null;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            clazz = key.getClass();
            assertEquals("Key class is Text.", Text.class, clazz);
            value = reader.getCurrentValue();
            clazz = value.getClass();
            assertEquals("Value class is Text.", Text.class, clazz);
            final int k = Integer.parseInt(key.toString());
            final int v = Integer.parseInt(value.toString());
            assertEquals("Bad key", 0, k % 2);
            assertEquals("Mismatched key/value", k / 2, v);
            LOG.debug("read " + v);
            assertFalse("Key in multiple partitions.", bits.get(v));
            bits.set(v);
            count++;
          }
          LOG.debug("splits[" + j + "]=" + splits.get(j) +" count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}