org.apache.hadoop.mapred.IFile.Writer Java Examples
The following examples show how to use
org.apache.hadoop.mapred.IFile.Writer.
Examples are drawn from open-source projects; the source file, project, and community vote count are noted in the header above each example.
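Before the project examples, here is a minimal, self-contained sketch of basic IFile.Writer usage. It relies only on calls that appear in the examples below (the seven-argument Writer constructor, append(), close(), getRawLength(), getCompressedLength()); the class name, output path, and IntWritable/Text key and value types are illustrative assumptions, not taken from any of the quoted projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RawLocalFileSystem;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.IFile.Writer;

public class IFileWriterSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);

    // Illustrative local output path for the IFile data.
    FSDataOutputStream out = fs.create(new Path("/tmp/ifile-example.out"));

    // Constructor arguments: conf, stream, key class, value class,
    // compression codec (null = none), spill counter (null = none),
    // and ownOutputStream = true so close() also closes the stream.
    Writer<IntWritable, Text> writer = new Writer<IntWritable, Text>(
        conf, out, IntWritable.class, Text.class, null, null, true);
    try {
      // IFile holds sorted map output; callers are responsible for
      // appending records in key order.
      writer.append(new IntWritable(1), new Text("first"));
      writer.append(new IntWritable(2), new Text("second"));
    } finally {
      writer.close();
    }

    System.out.println("raw length: " + writer.getRawLength()
        + ", compressed length: " + writer.getCompressedLength());
  }
}

This mirrors the pattern in the TestPipeApplication examples below, where a Writer constructed the same way is handed to a CombineOutputCollector via setWriter().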
Example #1
Source File: TestCombineOutputCollector.java From hadoop with Apache License 2.0 | 6 votes |
@Test
public void testCustomCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();
  conf.set(MRJobConfig.COMBINE_RECORDS_BEFORE_PROGRESS, "2");

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 1);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 2);
  verify(mockTaskReporter, times(1)).progress();
}
Example #2
Source File: TestCombineOutputCollector.java From hadoop with Apache License 2.0 | 6 votes |
@Test
public void testDefaultCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(1)).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(2)).progress();
}
Example #3
Source File: TestCombineOutputCollector.java From big-c with Apache License 2.0 | 6 votes |
@Test
public void testDefaultCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(1)).progress();

  for (int i = 0; i < Task.DEFAULT_COMBINE_RECORDS_BEFORE_PROGRESS; i++) {
    coc.collect("dummy", i);
  }
  verify(mockTaskReporter, times(2)).progress();
}
Example #4
Source File: TestCombineOutputCollector.java From big-c with Apache License 2.0 | 6 votes |
@Test
public void testCustomCollect() throws Throwable {
  //mock creation
  TaskReporter mockTaskReporter = mock(TaskReporter.class);

  @SuppressWarnings("unchecked")
  Writer<String, Integer> mockWriter = mock(Writer.class);

  Configuration conf = new Configuration();
  conf.set(MRJobConfig.COMBINE_RECORDS_BEFORE_PROGRESS, "2");

  coc = new CombineOutputCollector<String, Integer>(outCounter, mockTaskReporter, conf);
  coc.setWriter(mockWriter);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 1);
  verify(mockTaskReporter, never()).progress();

  coc.collect("dummy", 2);
  verify(mockTaskReporter, times(1)).progress();
}
Example #5
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
    unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
    new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
           " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
    Merger.merge(jobConf, rfs,
                 (Class<K>)jobConf.getMapOutputKeyClass(),
                 (Class<V>)jobConf.getMapOutputValueClass(),
                 inMemorySegments, inMemorySegments.size(),
                 new Path(reduceId.toString()),
                 (RawComparator<K>)jobConf.getOutputKeyComparator(),
                 reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
           " Memory-to-Memory merge of the " + noInMemorySegments +
           " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #6
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
private Writer<K,V> createSpillFile() throws IOException {
  Path tmp =
      new Path(MRJobConfig.OUTPUT + "/backup_" + tid.getId() + "_"
          + (spillNumber++) + ".out");

  LOG.info("Created file: " + tmp);

  file = lDirAlloc.getLocalPathForWrite(tmp.toUri().getPath(),
      -1, conf);
  FSDataOutputStream out = fs.create(file);
  out = CryptoUtils.wrapIfNecessary(conf, out);
  return new Writer<K, V>(conf, out, null, null, null, null, true);
}
Example #7
Source File: Merger.java From hadoop-gpu with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong("mapred.merge.recordsBeforeProgress",
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #8
Source File: Merger.java From hadoop with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong(JobContext.RECORDS_BEFORE_PROGRESS,
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #9
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
private Writer<K,V> createSpillFile() throws IOException {
  Path tmp =
      new Path(MRJobConfig.OUTPUT + "/backup_" + tid.getId() + "_"
          + (spillNumber++) + ".out");

  LOG.info("Created file: " + tmp);

  file = lDirAlloc.getLocalPathForWrite(tmp.toUri().getPath(),
      -1, conf);
  FSDataOutputStream out = fs.create(file);
  out = CryptoUtils.wrapIfNecessary(conf, out);
  return new Writer<K, V>(conf, out, null, null, null, null, true);
}
Example #10
Source File: Merger.java From big-c with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong(JobContext.RECORDS_BEFORE_PROGRESS,
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #11
Source File: ReducePartition.java From RDFS with Apache License 2.0 | 5 votes |
public IndexRecord spill(JobConf job, FSDataOutputStream out,
    Class<K> keyClass, Class<V> valClass, CompressionCodec codec,
    Counter spillCounter) throws IOException {
  IFile.Writer<K, V> writer = null;
  IndexRecord rec = new IndexRecord();
  long segmentStart = out.getPos();
  try {
    writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
        spillCounter);
    // spill directly
    KeyValueSpillIterator kvSortedArray = this.getKeyValueSpillIterator();
    MemoryBlockIndex memBlkIdx = kvSortedArray.next();
    while (memBlkIdx != null) {
      int pos = memBlkIdx.getIndex();
      MemoryBlock memBlk = memBlkIdx.getMemoryBlock();
      writer.append(kvbuffer, memBlk.offsets[pos], memBlk.keyLenArray[pos],
          memBlk.valueLenArray[pos]);
      memBlkIdx = kvSortedArray.next();
    }
  } finally {
    // close the writer
    if (null != writer)
      writer.close();
  }
  rec.startOffset = segmentStart;
  rec.rawLength = writer.getRawLength();
  rec.partLength = writer.getCompressedLength();
  writer = null;
  return rec;
}
Example #12
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
    unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
    new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
           " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
    Merger.merge(jobConf, rfs,
                 (Class<K>)jobConf.getMapOutputKeyClass(),
                 (Class<V>)jobConf.getMapOutputValueClass(),
                 inMemorySegments, inMemorySegments.size(),
                 new Path(reduceId.toString()),
                 (RawComparator<K>)jobConf.getOutputKeyComparator(),
                 reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
           " Memory-to-Memory merge of the " + noInMemorySegments +
           " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #13
Source File: Merger.java From RDFS with Apache License 2.0 | 5 votes |
public static <K extends Object, V extends Object>
void writeFile(RawKeyValueIterator records, Writer<K, V> writer,
               Progressable progressable, Configuration conf)
throws IOException {
  long progressBar = conf.getLong("mapred.merge.recordsBeforeProgress",
      10000);
  long recordCtr = 0;
  while (records.next()) {
    writer.append(records.getKey(), records.getValue());

    if (((recordCtr++) % progressBar) == 0) {
      progressable.progress();
    }
  }
}
Example #14
Source File: BlockMapOutputBuffer.java From RDFS with Apache License 2.0 | 4 votes |
public void spillSingleRecord(K key, V value, int part) throws IOException {
  ProcResourceValues spillStartProcVals =
      task.getCurrentProcResourceValues();
  long spillStartMilli = System.currentTimeMillis();
  // spill
  FSDataOutputStream out = null;
  long spillBytes = 0;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        task.mapOutputFile.getSpillFileForWrite(getTaskID(), numSpills,
            key.getLength() + value.getLength());
    out = rfs.create(filename);
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K, V>(job, out, keyClass, valClass,
            codec, task.spilledRecordsCounter);
        if (i == part) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();
        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillBytes += writer.getCompressedLength();
        spillRec.putIndex(rec, i);
        writer = null;
      } catch (IOException e) {
        if (null != writer)
          writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename =
          task.mapOutputFile.getSpillIndexFileForWrite(getTaskID(),
              numSpills, partitions * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
          spillRec.size() * MapTask.MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill big record " + numBigRecordsSpills);
    ++numBigRecordsSpills;
    ++numSpills;
  } finally {
    if (out != null)
      out.close();
  }
  long spillEndMilli = System.currentTimeMillis();
  ProcResourceValues spillEndProcVals =
      task.getCurrentProcResourceValues();
  mapSpillSortCounter.incCountersPerSpill(spillStartProcVals,
      spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
  mapSpillSortCounter.incSpillSingleRecord();
}
Example #15
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments,0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
    mapOutputFile.getInputFileForWrite(mapTaskId,
                                       mergeOutputSize).suffix(
                                           Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
             " segments...");

    rIter = Merger.merge(jobConf, rfs,
                         (Class<K>)jobConf.getMapOutputKeyClass(),
                         (Class<V>)jobConf.getMapOutputValueClass(),
                         inMemorySegments, inMemorySegments.size(),
                         new Path(reduceId.toString()),
                         (RawComparator<K>)jobConf.getOutputKeyComparator(),
                         reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}
Example #16
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 4 votes |
@Override
public void merge(List<CompressAwarePath> inputs) throws IOException {
  // sanity check
  if (inputs == null || inputs.isEmpty()) {
    LOG.info("No ondisk files to merge...");
    return;
  }

  long approxOutputSize = 0;
  int bytesPerSum =
    jobConf.getInt("io.bytes.per.checksum", 512);

  LOG.info("OnDiskMerger: We have " + inputs.size() +
           " map outputs on disk. Triggering merge...");

  // 1. Prepare the list of files to be merged.
  for (CompressAwarePath file : inputs) {
    approxOutputSize += localFS.getFileStatus(file).getLen();
  }

  // add the checksum length
  approxOutputSize +=
    ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

  // 2. Start the on-disk merge process
  Path outputPath =
    localDirAllocator.getLocalPathForWrite(inputs.get(0).toString(),
        approxOutputSize, jobConf).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator iter = null;
  CompressAwarePath compressAwarePath;
  Path tmpDir = new Path(reduceId.toString());
  try {
    iter = Merger.merge(jobConf, rfs,
                        (Class<K>) jobConf.getMapOutputKeyClass(),
                        (Class<V>) jobConf.getMapOutputValueClass(),
                        codec, inputs.toArray(new Path[inputs.size()]),
                        true, ioSortFactor, tmpDir,
                        (RawComparator<K>) jobConf.getOutputKeyComparator(),
                        reporter, spilledRecordsCounter, null,
                        mergedMapOutputsCounter, null);

    Merger.writeFile(iter, writer, reporter, jobConf);
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());
  } catch (IOException e) {
    localFS.delete(outputPath, true);
    throw e;
  }

  closeOnDiskFile(compressAwarePath);

  LOG.info(reduceId +
      " Finished merging " + inputs.size() +
      " map output files on disk of total-size " +
      approxOutputSize + "." +
      " Local output file is " + outputPath + " of size " +
      localFS.getFileStatus(outputPath).getLen());
}
Example #17
Source File: Task.java From RDFS with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #18
Source File: MapTask.java From RDFS with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    long spillStartMilli = System.currentTimeMillis();
    ProcResourceValues spillStartProcVals = getCurrentProcResourceValues();
    long spillBytes = 0;

    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K,V>(job, out, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillBytes += writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }

    long spillEndMilli = System.currentTimeMillis();
    ProcResourceValues spillEndProcVals = getCurrentProcResourceValues();
    spillSortCounters.incCountersPerSpill(spillStartProcVals,
        spillEndProcVals, spillEndMilli - spillStartMilli, spillBytes);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #19
Source File: MapTask.java From hadoop-gpu with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        writer = new IFile.Writer<K,V>(job, out, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #20
Source File: Task.java From hadoop-gpu with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #21
Source File: MapTask.java From hadoop-gpu with Apache License 2.0 | 4 votes |
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  //approximate the length of the output file to be the length of the
  //buffer + header lengths for the partitions
  long size = (bufend >= bufstart
      ? bufend - bufstart
      : (bufvoid - bufend) + bufstart) +
      partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename = mapOutputFile.getSpillFileForWrite(getTaskID(),
        numSpills, size);
    out = rfs.create(filename);

    final int endPosition = (kvend > kvstart)
        ? kvend
        : kvoffsets.length + kvend;
    sorter.sort(MapOutputBuffer.this, kvstart, endPosition, reporter);
    int spindex = kvstart;
    IndexRecord rec = new IndexRecord();
    InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        writer = new Writer<K, V>(job, out, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < endPosition &&
              kvindices[kvoffsets[spindex % kvoffsets.length]
                        + PARTITION] == i) {
            final int kvoff = kvoffsets[spindex % kvoffsets.length];
            getVBytesForOffset(kvoff, value);
            key.reset(kvbuffer, kvindices[kvoff + KEYSTART],
                      (kvindices[kvoff + VALSTART] -
                       kvindices[kvoff + KEYSTART]));
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < endPosition &&
              kvindices[kvoffsets[spindex % kvoffsets.length]
                        + PARTITION] == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength();
        rec.partLength = writer.getCompressedLength();
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= INDEX_CACHE_MEMORY_LIMIT) {
      // create spill index file
      Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(
          getTaskID(), numSpills,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #22
Source File: TestPipeApplication.java From hadoop with Apache License 2.0 | 4 votes |
/**
 * Test PipesMapRunner: tests the transfer of data from the reader.
 *
 * @throws Exception
 */
@Test
public void testRunner() throws Exception {
  // clean old password files
  File[] psw = cleanTokenPasswordFile();
  try {
    RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
    JobConf conf = new JobConf();
    conf.set(Submitter.IS_JAVA_RR, "true");
    // for stdout and stderr
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);

    CombineOutputCollector<IntWritable, Text> output =
        new CombineOutputCollector<IntWritable, Text>(
            new Counters.Counter(), new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace + File.separator + "outfile")), IntWritable.class,
        Text.class, null, null, true);
    output.setWriter(wr);
    // stub for client
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");

    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
    TestTaskReporter reporter = new TestTaskReporter();
    PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner =
        new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();

    initStdOut(conf);

    runner.configure(conf);
    runner.run(rReader, output, reporter);

    String stdOut = readStdOut(conf);

    // test part of translated data. As common file for client and test -
    // clients stdOut
    // check version
    assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0"));
    // check key and value classes
    assertTrue(stdOut
        .contains("Key class:org.apache.hadoop.io.FloatWritable"));
    assertTrue(stdOut
        .contains("Value class:org.apache.hadoop.io.NullWritable"));
    // test have sent all data from reader
    assertTrue(stdOut.contains("value:0.0"));
    assertTrue(stdOut.contains("value:9.0"));

  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #23
Source File: MapTask.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Handles the degenerate case where serialization fails to fit in
 * the in-memory buffer, so we must spill the record from collect
 * directly to a spill file. Consider this "losing".
 */
private void spillSingleRecord(final K key, final V value,
                               int partition) throws IOException {
  long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    // we don't run the combiner for a single record
    IndexRecord rec = new IndexRecord();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        // Create a new codec, don't care!
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new IFile.Writer<K,V>(job, partitionOut, keyClass, valClass, codec,
                                       spilledRecordsCounter);

        if (i == partition) {
          final long recordStart = out.getPos();
          writer.append(key, value);
          // Note that our map byte count will not be accurate with
          // compression
          mapOutputByteCounter.increment(out.getPos() - recordStart);
        }
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } catch (IOException e) {
        if (null != writer) writer.close();
        throw e;
      }
    }
    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #24
Source File: MapTask.java From big-c with Apache License 2.0 | 4 votes |
private void sortAndSpill() throws IOException, ClassNotFoundException,
                                   InterruptedException {
  //approximate the length of the output file to be the length of the
  //buffer + header lengths for the partitions
  final long size = distanceTo(bufstart, bufend, bufvoid) +
      partitions * APPROX_HEADER_LENGTH;
  FSDataOutputStream out = null;
  try {
    // create spill file
    final SpillRecord spillRec = new SpillRecord(partitions);
    final Path filename =
        mapOutputFile.getSpillFileForWrite(numSpills, size);
    out = rfs.create(filename);

    final int mstart = kvend / NMETA;
    final int mend = 1 + // kvend is a valid record
      (kvstart >= kvend
      ? kvstart
      : kvmeta.capacity() + kvstart) / NMETA;
    sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
    int spindex = mstart;
    final IndexRecord rec = new IndexRecord();
    final InMemValBytes value = new InMemValBytes();
    for (int i = 0; i < partitions; ++i) {
      IFile.Writer<K, V> writer = null;
      try {
        long segmentStart = out.getPos();
        FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
        writer = new Writer<K, V>(job, partitionOut, keyClass, valClass, codec,
                                  spilledRecordsCounter);
        if (combinerRunner == null) {
          // spill directly
          DataInputBuffer key = new DataInputBuffer();
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
            final int kvoff = offsetFor(spindex % maxRec);
            int keystart = kvmeta.get(kvoff + KEYSTART);
            int valstart = kvmeta.get(kvoff + VALSTART);
            key.reset(kvbuffer, keystart, valstart - keystart);
            getVBytesForOffset(kvoff, value);
            writer.append(key, value);
            ++spindex;
          }
        } else {
          int spstart = spindex;
          while (spindex < mend &&
              kvmeta.get(offsetFor(spindex % maxRec)
                        + PARTITION) == i) {
            ++spindex;
          }
          // Note: we would like to avoid the combiner if we've fewer
          // than some threshold of records for a partition
          if (spstart != spindex) {
            combineCollector.setWriter(writer);
            RawKeyValueIterator kvIter =
              new MRResultIterator(spstart, spindex);
            combinerRunner.combine(kvIter, combineCollector);
          }
        }

        // close the writer
        writer.close();

        // record offsets
        rec.startOffset = segmentStart;
        rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
        rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
        spillRec.putIndex(rec, i);

        writer = null;
      } finally {
        if (null != writer) writer.close();
      }
    }

    if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
      // create spill index file
      Path indexFilename =
          mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
              * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRec.writeToFile(indexFilename, job);
    } else {
      indexCacheList.add(spillRec);
      totalIndexCacheMemory +=
        spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    }
    LOG.info("Finished spill " + numSpills);
    ++numSpills;
  } finally {
    if (out != null) out.close();
  }
}
Example #25
Source File: Task.java From big-c with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #26
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
public synchronized void setWriter(Writer<K, V> writer) {
  this.writer = writer;
}
Example #27
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Test org.apache.hadoop.mapred.pipes.Application: tests internal functions
 * such as MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS...
 *
 * @throws Throwable
 */
@Test
public void testApplication() throws Throwable {
  JobConf conf = new JobConf();

  RecordReader<FloatWritable, NullWritable> rReader = new Reader();

  // client for test
  File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");

  TestTaskReporter reporter = new TestTaskReporter();

  File[] psw = cleanTokenPasswordFile();
  try {

    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());

    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    FakeCollector output = new FakeCollector(new Counters.Counter(),
        new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace.getAbsolutePath() + File.separator + "outfile")),
        IntWritable.class, Text.class, null, null, true);
    output.setWriter(wr);
    conf.set(Submitter.PRESERVE_COMMANDFILE, "true");

    initStdOut(conf);

    Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application =
        new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(
            conf, rReader, output, reporter, IntWritable.class, Text.class);
    application.getDownlink().flush();

    application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));

    application.getDownlink().flush();

    application.waitForFinish();

    wr.close();

    // test getDownlink().mapItem();
    String stdOut = readStdOut(conf);
    assertTrue(stdOut.contains("key:3"));
    assertTrue(stdOut.contains("value:txt"));

    // reporter test: counter and status should be sent
    // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
    assertEquals(1.0, reporter.getProgress(), 0.01);
    assertNotNull(reporter.getCounter("group", "name"));
    // test status MessageType.STATUS
    assertEquals(reporter.getStatus(), "PROGRESS");
    stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator
        + "outfile"));
    // check MessageType.PROGRESS
    assertEquals(0.55f, rReader.getProgress(), 0.001);
    application.getDownlink().close();
    // test MessageType.OUTPUT
    Entry<IntWritable, Text> entry = output.getCollect().entrySet()
        .iterator().next();
    assertEquals(123, entry.getKey().get());
    assertEquals("value", entry.getValue().toString());
    try {
      // try to abort
      application.abort(new Throwable());
      fail();
    } catch (IOException e) {
      // abort works ?
      assertEquals("pipe child exception", e.getMessage());
    }
  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #28
Source File: TestPipeApplication.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Test PipesMapRunner: tests the transfer of data from the reader.
 *
 * @throws Exception
 */
@Test
public void testRunner() throws Exception {
  // clean old password files
  File[] psw = cleanTokenPasswordFile();
  try {
    RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
    JobConf conf = new JobConf();
    conf.set(Submitter.IS_JAVA_RR, "true");
    // for stdout and stderr
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);

    CombineOutputCollector<IntWritable, Text> output =
        new CombineOutputCollector<IntWritable, Text>(
            new Counters.Counter(), new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.setConf(conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(
        new Path(workSpace + File.separator + "outfile")), IntWritable.class,
        Text.class, null, null, true);
    output.setWriter(wr);
    // stub for client
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");

    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text(
            "service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
    TestTaskReporter reporter = new TestTaskReporter();
    PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner =
        new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();

    initStdOut(conf);

    runner.configure(conf);
    runner.run(rReader, output, reporter);

    String stdOut = readStdOut(conf);

    // test part of translated data. As common file for client and test -
    // clients stdOut
    // check version
    assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0"));
    // check key and value classes
    assertTrue(stdOut
        .contains("Key class:org.apache.hadoop.io.FloatWritable"));
    assertTrue(stdOut
        .contains("Value class:org.apache.hadoop.io.NullWritable"));
    // test have sent all data from reader
    assertTrue(stdOut.contains("value:0.0"));
    assertTrue(stdOut.contains("value:9.0"));

  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
Example #29
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 4 votes |
@Override
public void merge(List<CompressAwarePath> inputs) throws IOException {
  // sanity check
  if (inputs == null || inputs.isEmpty()) {
    LOG.info("No ondisk files to merge...");
    return;
  }

  long approxOutputSize = 0;
  int bytesPerSum =
    jobConf.getInt("io.bytes.per.checksum", 512);

  LOG.info("OnDiskMerger: We have " + inputs.size() +
           " map outputs on disk. Triggering merge...");

  // 1. Prepare the list of files to be merged.
  for (CompressAwarePath file : inputs) {
    approxOutputSize += localFS.getFileStatus(file).getLen();
  }

  // add the checksum length
  approxOutputSize +=
    ChecksumFileSystem.getChecksumLength(approxOutputSize, bytesPerSum);

  // 2. Start the on-disk merge process
  Path outputPath =
    localDirAllocator.getLocalPathForWrite(inputs.get(0).toString(),
        approxOutputSize, jobConf).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator iter = null;
  CompressAwarePath compressAwarePath;
  Path tmpDir = new Path(reduceId.toString());
  try {
    iter = Merger.merge(jobConf, rfs,
                        (Class<K>) jobConf.getMapOutputKeyClass(),
                        (Class<V>) jobConf.getMapOutputValueClass(),
                        codec, inputs.toArray(new Path[inputs.size()]),
                        true, ioSortFactor, tmpDir,
                        (RawComparator<K>) jobConf.getOutputKeyComparator(),
                        reporter, spilledRecordsCounter, null,
                        mergedMapOutputsCounter, null);

    Merger.writeFile(iter, writer, reporter, jobConf);
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());
  } catch (IOException e) {
    localFS.delete(outputPath, true);
    throw e;
  }

  closeOnDiskFile(compressAwarePath);

  LOG.info(reduceId +
      " Finished merging " + inputs.size() +
      " map output files on disk of total-size " +
      approxOutputSize + "." +
      " Local output file is " + outputPath + " of size " +
      localFS.getFileStatus(outputPath).getLen());
}
Example #30
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
    createInMemorySegments(inputs, inMemorySegments,0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
    mapOutputFile.getInputFileForWrite(mapTaskId,
                                       mergeOutputSize).suffix(
                                           Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
             " segments...");

    rIter = Merger.merge(jobConf, rfs,
                         (Class<K>)jobConf.getMapOutputKeyClass(),
                         (Class<V>)jobConf.getMapOutputValueClass(),
                         inMemorySegments, inMemorySegments.size(),
                         new Path(reduceId.toString()),
                         (RawComparator<K>)jobConf.getOutputKeyComparator(),
                         reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}