org.apache.hadoop.mapred.IFile.Writer Java Examples
The following examples show how to use
org.apache.hadoop.mapred.IFile.Writer.
Examples are drawn from open-source projects; the source file, project, and community vote count are noted in the header above each example.
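Before the project examples, here is a minimal, self-contained sketch of basic IFile.Writer usage. It relies only on calls that appear in the examples below (the seven-argument Writer constructor, append(), close(), getRawLength(), getCompressedLength()); the class name, output path, and IntWritable/Text key and value types are illustrative assumptions, not taken from any of the quoted projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.IFile.Writer;

public class IFileWriterSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);

    // Illustrative local output path for the IFile data.
    FSDataOutputStream out = fs.create(new Path("/tmp/ifile-example.out"));

    // Constructor arguments: conf, stream, key class, value class,
    // compression codec (null = none), spill counter (null = none),
    // and ownOutputStream = true so close() also closes the stream.
    Writer<IntWritable, Text> writer = new Writer<IntWritable, Text>(
        conf, out, IntWritable.class, Text.class, null, null, true);
    try {
      // IFile holds sorted map output; callers are responsible for
      // appending records in key order.
      writer.append(new IntWritable(1), new Text("first"));
      writer.append(new IntWritable(2), new Text("second"));
    } finally {
      writer.close();
    }

    System.out.println("raw length: " + writer.getRawLength()
        + ", compressed length: " + writer.getCompressedLength());
  }
}

This mirrors the pattern in the TestPipeApplication examples below, where a Writer constructed the same way is handed to a CombineOutputCollector via setWriter().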
Example #1
Source File: TestCombineOutputCollector.java From hadoop with Apache License 2.0 | 6 votes |
@Test
public void testCustomCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();
  conf.set(MRJobConfig.COMBINE_RECORDS_BEFORE_PROGRESS, "2");

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 1);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 2);
  verify(mockTaskReporter, times(1)).progress();
}
Example #2
Source File: TestCombineOutputCollector.java From hadoop with Apache License 2.0 | 6 votes |
@Test
public void testDefaultCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(1)).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(2)).progress();
}
Example #3
Source File: TestCombineOutputCollector.java From big-c with Apache License 2.0 | 6 votes |
@Test
public void testDefaultCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(1)).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(2)).progress();
}
Example #4
Source File: TestCombineOutputCollector.java From big-c with Apache License 2.0 | 6 votes |
@Test
public void testCustomCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();
  conf.set(MRJobConfig.COMBINE_RECORDS_BEFORE_PROGRESS, "2");

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 1);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 2);
  verify(mockTaskReporter, times(1)).progress();
}
Example #5
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
    unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
    new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
           " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
    Merger.merge(jobConf, rfs,
                 (Class<K>)jobConf.getMapOutputKeyClass(),
                 (Class<V>)jobConf.getMapOutputValueClass(),
                 inMemorySegments, inMemorySegments.size(),
                 new Path(reduceId.toString()),
                 (RawComparator<K>)jobConf.getOutputKeyComparator(),
                 reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
           " Memory-to-Memory merge of the " + noInMemorySegments +
           " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #6
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
private Writer<K,V> createSpillFile() throws IOException {
  Path tmp =
      new Path(MRJobConfig.OUTPUT + "/backup_" + tid.getId() + "_"
          + (spillNumber++) + ".out");

  LOG.info("Created file: " + tmp);

  file = lDirAlloc.getLocalPathForWrite(tmp.toUri().getPath(),
      -1, conf);
  FSDataOutputStream out = fs.create(file);
  out = CryptoUtils.wrapIfNecessary(conf, out);
  return new Writer<K, V>(conf, out, null, null, null, null, true);
}
Example #7
Source File: Merger.java From hadoop-gpu with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong("mapred.merge.recordsBeforeProgress",
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #8
Source File: Merger.java From hadoop with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong(JobContext.RECORDS_BEFORE_PROGRESS,
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #9
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
private Writer<K,V> createSpillFile() throws IOException {
  Path tmp =
      new Path(MRJobConfig.OUTPUT + "/backup_" + tid.getId() + "_"
          + (spillNumber++) + ".out");

  LOG.info("Created file: " + tmp);

  file = lDirAlloc.getLocalPathForWrite(tmp.toUri().getPath(),
      -1, conf);
  FSDataOutputStream out = fs.create(file);
  out = CryptoUtils.wrapIfNecessary(conf, out);
  return new Writer<K, V>(conf, out, null, null, null, null, true);
}
Example #10
Source File: Merger.java From big-c with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong(JobContext.RECORDS_BEFORE_PROGRESS,
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #11
Source File: ReducePartition.java From RDFS with Apache License 2.0 | 5 votes |
public IndexRecord spill(JobConf job, FSDataOutputStream out,
    Class<K> keyClass, Class<V> valClass, CompressionCodec codec,
    Counter spillCounter) throws IOException {
  IFile.Writer<K, V> writer = null;
  IndexRecord rec = new IndexRecord();
  long segmentStart = out.getPos();
  try {
    writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
        spillCounter);
    // spill directly
    KeyValueSpillIterator kvSortedArray = this.getKeyValueSpillIterator();
    MemoryBlockIndex memBlkIdx = kvSortedArray.next();
    while (memBlkIdx != null) {
      int pos = memBlkIdx.getIndex();
      MemoryBlock memBlk = memBlkIdx.getMemoryBlock();
      writer.append(kvbuffer, memBlk.offsets[pos], memBlk.keyLenArray[pos],
          memBlk.valueLenArray[pos]);
      memBlkIdx = kvSortedArray.next();
    }
  } finally {
    // close the writer
    if (null != writer)
      writer.close();
  }
  rec.startOffset = segmentStart;
  rec.rawLength = writer.getRawLength();
  rec.partLength = writer.getCompressedLength();
  writer = null;
  return rec;
}
Example #12
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
    unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
    new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
           " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
    Merger.merge(jobConf, rfs,
                 (Class<K>)jobConf.getMapOutputKeyClass(),
                 (Class<V>)jobConf.getMapOutputValueClass(),
                 inMemorySegments, inMemorySegments.size(),
                 new Path(reduceId.toString()),
                 (RawComparator<K>)jobConf.getOutputKeyComparator(),
                 reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
           " Memory-to-Memory merge of the " + noInMemorySegments +
           " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #13
Source File: Merger.java From RDFS with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong("mapred.merge.recordsBeforeProgress",
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #14
Source File: BlockMapOutputBuffer.java From RDFS with Apache License 2.0 | 4 votes |
public void spillSingleRecord(K key, V value, int part) throws IOException {
  ProcResourceValues spillStartProcVals =
      task.getCurrentProcResourceValues();
  long spillStartMilli = System.currentTimeMillis();
  // spill
  FSDataOutputStream out = null;
  long spillBytes = 0;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        task.mapOutputFile.getSpillFileForWrite(getTaskID(), numSpills,
            key.getLength() + value.getLength());
    out = rfs.create(filename);
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K, V>(job, out, keyClass, valClass,
            codec, task.spilledRecordsCounter);
        if (i == part) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();
        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillBytes += writer.getCompressedLength();
        spillRec.putIndex(rec, i);
        writer = null;
      } catch (IOException e) {
        if (null != writer)
          writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename =
          task.mapOutputFile.getSpillIndexFileForWrite(getTaskID(),
              numSpills, partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
          spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill big record " + numBigRecordsSpills);
    ++numBigRecordsSpills;
    ++numSpills;
  } finally {
    if (out != null)
      out.close();
  }
  long spillEndMilli = System.currentTimeMillis();
  ProcResourceValues spillEndProcVals =
      task.getCurrentProcResourceValues();
  mapSpillSortCounter.incCountersPerSpill(spillStartProcVals,
      spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
  mapSpillSortCounter.incSpillSingleRecord();
}
Example #15
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments,0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
    mapOutputFile.getInputFileForWrite(mapTaskId,
                                       mergeOutputSize).suffix(
                                           Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
             " segments...");

    rIter = Merger.merge(jobConf, rfs,
                         (Class<K>)jobConf.getMapOutputKeyClass(),
                         (Class<V>)jobConf.getMapOutputValueClass(),
                         inMemorySegments, inMemorySegments.size(),
                         new Path(reduceId.toString()),
                         (RawComparator<K>)jobConf.getOutputKeyComparator(),
                         reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}
Example #16
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 4 votes |
@Override
public void merge(List<CompressAwarePath> inputs) throws IOException {
  // sanity check
  if (inputs == null || inputs.isEmpty()) {
    LOG.info("No ondisk files to merge...");
    return;
  }

  long approxOutputSize = 0;
  int bytesPerSum =
    jobConf.getInt("io.bytes.per.checksum", 512);

  LOG.info("OnDiskMerger: We have " + inputs.size() +
           " map outputs on disk. Triggering merge...");

  // 1. Prepare the list of files to be merged.
  for (CompressAwarePath file : inputs) {
    approxOutputSize += localFS.getFileStatus(file).getLen();
  }

  // add the checksum length
  approxOutputSize +=
    ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

  // 2. Start the on-disk merge process
  Path outputPath =
    localDirAllocator.getLocalPathForWrite(inputs.get(0).toString(),
        approxOutputSize, jobConf).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator iter = null;
  CompressAwarePath compressAwarePath;
  Path tmpDir = new Path(reduceId.toString());
  try {
    iter = Merger.merge(jobConf, rfs,
                        (Class<K>) jobConf.getMapOutputKeyClass(),
                        (Class<V>) jobConf.getMapOutputValueClass(),
                        codec, inputs.toArray(new Path[inputs.size()]),
                        true, ioSortFactor, tmpDir,
                        (RawComparator<K>) jobConf.getOutputKeyComparator(),
                        reporter, spilledRecordsCounter, null,
                        mergedMapOutputsCounter, null);

    Merger.writeFile(iter, writer, reporter, jobConf);
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());
  } catch (IOException e) {
    localFS.delete(outputPath, true);
    throw e;
  }

  closeOnDiskFile(compressAwarePath);

  LOG.info(reduceId +
      " Finished merging " + inputs.size() +
      " map output files on disk of total-size " +
      approxOutputSize + "." +
      " Local output file is " + outputPath + " of size " +
      localFS.getFileStatus(outputPath).getLen());
}
Example #17
Source File: Task.java From RDFS with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #18
Source File: MapTask.java From RDFS with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    long spillStartMilli = System.currentTimeMillis();
    ProcResourceValues spillStartProcVals = getCurrentProcResourceValues();
    long spillBytes = 0;

    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K,V>(job, out, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillBytes += writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }

    long spillEndMilli = System.currentTimeMillis();
    ProcResourceValues spillEndProcVals = getCurrentProcResourceValues();
    spillSortCounters.incCountersPerSpill(spillStartProcVals,
        spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #19
Source File: MapTask.java From hadoop-gpu with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K,V>(job, out, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #20
Source File: Task.java From hadoop-gpu with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #21
Source File: MapTask.java From hadoop-gpu with Apache License 2.0 | 4 votes |
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  //approximate the length of the output file to be the length of the
  //buffer + header lengths for the partitions
  long size = (bufend >= bufstart
      ? bufend - bufstart
      : (bufvoid - bufend) + bufstart) +
      partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    final int endPosition = (kvend > kvstart)
        ? kvend
        : kvoffsets.length + kvend;
    sorter.sort(MapOutputBuffer.this, kvstart, endPosition, reporter);
    int spindex = kvstart;
    IndexRecord rec = new IndexRecord();
    InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < endPosition &&
              kvindices[kvoffsets[spindex % kvoffsets.length]
                        + PARTITION] == i) {
            final int kvoff = kvoffsets[spindex % kvoffsets.length];
            getVBytesForOffset(kvoff, value);
            key.reset(kvbuffer, kvindices[kvoff + KEYSTART],
                      (kvindices[kvoff + VALSTART] -
                       kvindices[kvoff + KEYSTART]));
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < endPosition &&
              kvindices[kvoffsets[spindex % kvoffsets.length]
                        + PARTITION] == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #22
Source File: TestPipeApplication.java From hadoop with Apache License 2.0 | 4 votes |
/**
 * Test PipesMapRunner: tests the transfer of data from the reader.
 *
 * @throws Exception
 */
@Test
public void testRunner() throws Exception {
  // clean old password files
  File[] psw = cleanTokenPasswordFile();
  try {
    RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
    JobConf conf = new JobConf();
    conf.set(Submitter.IS_JAVA_RR, "true");
    // for stdout and stderr
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);

    CombineOutputCollector<IntWritable, Text> output =
        new CombineOutputCollector<IntWritable, Text>(
            new Counters.Counter(), new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace + File.separator + "outfile")), IntWritable.class,
        Text.class, null, null, true);
    output.setWriter(wr);
    // stub for client
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");

    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
    TestTaskReporter reporter = new TestTaskReporter();
    PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner =
        new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();

    initStdOut(conf);

    runner.configure(conf);
    runner.run(rReader, output, reporter);

    String stdOut = readStdOut(conf);

    // test part of translated data. As common file for client and test -
    // clients stdOut
    // check version
    assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0"));
    // check key and value classes
    assertTrue(stdOut
        .contains("Key class:org.apache.hadoop.io.FloatWritable"));
    assertTrue(stdOut
        .contains("Value class:org.apache.hadoop.io.NullWritable"));
    // test have sent all data from reader
    assertTrue(stdOut.contains("value:0.0"));
    assertTrue(stdOut.contains("value:9.0"));

  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #23
Source File: MapTask.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new IFile.Writer<K,V>(job, partitionOut, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #24
Source File: MapTask.java From big-c with Apache License 2.0 | 4 votes |
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  //approximate the length of the output file to be the length of the
  //buffer + header lengths for the partitions
  final long size = distanceTo(bufstart, bufend, bufvoid) +
      partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    final int mstart = kvend / NMETA;
    final int mend = 1 + // kvend is a valid record
      (kvstart >= kvend
      ? kvstart
      : kvmeta.capacity() + kvstart) / NMETA;
    sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
    int spindex = mstart;
    final IndexRecord rec = new IndexRecord();
    final InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
            final int kvoff = offsetFor(spindex % maxRec);
            int keystart = kvmeta.get(kvoff + KEYSTART);
            int valstart = kvmeta.get(kvoff + VALSTART);
            key.reset(kvbuffer, keystart, valstart - keystart);
            getVBytesForOffset(kvoff, value);
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec)
                        + PARTITION) == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #25
Source File: Task.java From big-c with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #26
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #27
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Test org.apache.hadoop.mapred.pipes.Application: tests internal functions
 * such as MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS...
 *
 * @throws Throwable
 */
@Test
public void testApplication() throws Throwable {
  JobConf conf = new JobConf();

  RecordReader<FloatWritable, NullWritable> rReader = new Reader();

  // client for test
  File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");

  TestTaskReporter reporter = new TestTaskReporter();

  File[] psw = cleanTokenPasswordFile();
  try {

    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());

    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    FakeCollector output = new FakeCollector(new Counters.Counter(),
        new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace.getAbsolutePath() + File.separator + "outfile")),
        IntWritable.class, Text.class, null, null, true);
    output.setWriter(wr);
    conf.set(Submitter.PRESERVE_COMMANDFILE, "true");

    initStdOut(conf);

    Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application =
        new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(
            conf, rReader, output, reporter, IntWritable.class, Text.class);
    application.getDownlink().flush();

    application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));

    application.getDownlink().flush();

    application.waitForFinish();

    wr.close();

    // test getDownlink().mapItem();
    String stdOut = readStdOut(conf);
    assertTrue(stdOut.contains("key:3"));
    assertTrue(stdOut.contains("value:txt"));

    // reporter test: counter and status should be sent
    // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
    assertEquals(1.0, reporter.getProgress(), 0.01);
    assertNotNull(reporter.getCounter("group", "name"));
    // test status MessageType.STATUS
    assertEquals(reporter.getStatus(), "PROGRESS");
    stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator
        + "outfile"));
    // check MessageType.PROGRESS
    assertEquals(0.55f, rReader.getProgress(), 0.001);
    application.getDownlink().close();
    // test MessageType.OUTPUT
    Entry<IntWritable, Text> entry = output.getCollect().entrySet()
        .iterator().next();
    assertEquals(123, entry.getKey().get());
    assertEquals("value", entry.getValue().toString());
    try {
      // try to abort
      application.abort(new Throwable());
      fail();
    } catch (IOException e) {
      // abort works ?
      assertEquals("pipe child exception", e.getMessage());
    }
  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #28
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Test PipesMapRunner: tests the transfer of data from the reader.
 *
 * @throws Exception
 */
@Test
public void testRunner() throws Exception {
  // clean old password files
  File[] psw = cleanTokenPasswordFile();
  try {
    RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
    JobConf conf = new JobConf();
    conf.set(Submitter.IS_JAVA_RR, "true");
    // for stdout and stderr
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);

    CombineOutputCollector<IntWritable, Text> output =
        new CombineOutputCollector<IntWritable, Text>(
            new Counters.Counter(), new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace + File.separator + "outfile")), IntWritable.class,
        Text.class, null, null, true);
    output.setWriter(wr);
    // stub for client
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");

    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
    TestTaskReporter reporter = new TestTaskReporter();
    PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner =
        new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();

    initStdOut(conf);

    runner.configure(conf);
    runner.run(rReader, output, reporter);

    String stdOut = readStdOut(conf);

    // test part of translated data. As common file for client and test -
    // clients stdOut
    // check version
    assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0"));
    // check key and value classes
    assertTrue(stdOut
        .contains("Key class:org.apache.hadoop.io.FloatWritable"));
    assertTrue(stdOut
        .contains("Value class:org.apache.hadoop.io.NullWritable"));
    // test have sent all data from reader
    assertTrue(stdOut.contains("value:0.0"));
    assertTrue(stdOut.contains("value:9.0"));

  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #29
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 4 votes |
@Override
public void merge(List<CompressAwarePath> inputs) throws IOException {
  // sanity check
  if (inputs == null || inputs.isEmpty()) {
    LOG.info("No ondisk files to merge...");
    return;
  }

  long approxOutputSize = 0;
  int bytesPerSum =
    jobConf.getInt("io.bytes.per.checksum", 512);

  LOG.info("OnDiskMerger: We have " + inputs.size() +
           " map outputs on disk. Triggering merge...");

  // 1. Prepare the list of files to be merged.
  for (CompressAwarePath file : inputs) {
    approxOutputSize += localFS.getFileStatus(file).getLen();
  }

  // add the checksum length
  approxOutputSize +=
    ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

  // 2. Start the on-disk merge process
  Path outputPath =
    localDirAllocator.getLocalPathForWrite(inputs.get(0).toString(),
        approxOutputSize, jobConf).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator iter = null;
  CompressAwarePath compressAwarePath;
  Path tmpDir = new Path(reduceId.toString());
  try {
    iter = Merger.merge(jobConf, rfs,
                        (Class<K>) jobConf.getMapOutputKeyClass(),
                        (Class<V>) jobConf.getMapOutputValueClass(),
                        codec, inputs.toArray(new Path[inputs.size()]),
                        true, ioSortFactor, tmpDir,
                        (RawComparator<K>) jobConf.getOutputKeyComparator(),
                        reporter, spilledRecordsCounter, null,
                        mergedMapOutputsCounter, null);

    Merger.writeFile(iter, writer, reporter, jobConf);
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());
  } catch (IOException e) {
    localFS.delete(outputPath, true);
    throw e;
  }

  closeOnDiskFile(compressAwarePath);

  LOG.info(reduceId +
      " Finished merging " + inputs.size() +
      " map output files on disk of total-size " +
      approxOutputSize + "." +
      " Local output file is " + outputPath + " of size " +
      localFS.getFileStatus(outputPath).getLen());
}
Example #30
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments,0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
    mapOutputFile.getInputFileForWrite(mapTaskId,
                                       mergeOutputSize).suffix(
                                           Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
             " segments...");

    rIter = Merger.merge(jobConf, rfs,
                         (Class<K>)jobConf.getMapOutputKeyClass(),
                         (Class<V>)jobConf.getMapOutputValueClass(),
                         inMemorySegments, inMemorySegments.size(),
                         new Path(reduceId.toString()),
                         (RawComparator<K>)jobConf.getOutputKeyComparator(),
                         reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}