Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart(). Each example notes the project it was taken from, along with its license and original source file.
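Almost all of the examples share one pattern: cast the generic InputSplit to a FileSplit, compute the byte range [start, end) of the split from getStart() and getLength(), then open the underlying file and seek to start. The sketch below distills that pattern into a minimal initialize() method; the class name SplitBoundsSketch and its fields are illustrative only and do not come from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Minimal sketch of the recurring pattern; names are illustrative,
// not taken from any project below.
public class SplitBoundsSketch {
    private long start; // byte offset of the first byte of this split
    private long end;   // byte offset one past the last byte of this split
    private FSDataInputStream in;

    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();         // where this split begins in the file
        end = start + split.getLength();  // getStart() + getLength() bounds the split
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        in = fs.open(file);
        in.seek(start);                   // position the stream at the split start
    }
}

Note that end is exclusive, and that readers of record-oriented formats layer extra logic on top of this: a split with a nonzero getStart() typically backs up or skips a partial first record (Example 5), header handling is restricted to the split whose getStart() is 0 (Examples 9 and 18), and sync-marker formats like Avro seek to the next marker at or after getStart() (Examples 11 and 14).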
Example 1
Source File: GryoRecordReader.java From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(
                ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(
                    IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0)
        this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
Example 2
Source File: XmlCollectionWithTagInputFormat.java From vxquery with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);
    // the byte range this split covers
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());
    // open the file and seek to the start of the split
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
Example 3
Source File: XMLLoader.java From spork with Apache License 2.0
/**
 * Delegates the initialization call to the wrapped record reader after
 * extending the length of the split, so that a record crossing the original
 * split boundary can still be read in full.
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    key = new LongWritable();
    value = new Text();
    if (split instanceof FileSplit) {
        FileSplit fsplit = (FileSplit) split;
        originalEnd = fsplit.getStart() + fsplit.getLength();
        Path path = fsplit.getPath();
        long fileEnd = path.getFileSystem(context.getConfiguration()).getFileStatus(path).getLen();
        FileSplit extendedSplit = new FileSplit(path, fsplit.getStart(),
                Math.min(fsplit.getLength() * 10, fileEnd - fsplit.getStart()),
                fsplit.getLocations());
        this.wrapped.initialize(extendedSplit, context);
    } else {
        throw new RuntimeException("Cannot override a split of type '" + split.getClass() + "'");
    }
}
Example 4
Source File: TestUniformSizeInputFormat.java From big-c with Apache License 2.0
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
    long lastEnd = 0;

    // Verify that each split's start matches the previous split's end,
    // so no part of the input is missed
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        long start = fileSplit.getStart();
        Assert.assertEquals(lastEnd, start);
        lastEnd = start + fileSplit.getLength();
    }

    // Verify there is nothing more to read from the input file
    SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem().getConf(),
            SequenceFile.Reader.file(listFile));
    try {
        reader.seek(lastEnd);
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
    } finally {
        IOUtils.closeStream(reader);
    }
}
Example 5
Source File: MultiLineInputFormat.java From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength",
            Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());
    if (start != 0) {
        // the split does not begin at the start of the file: back up one byte
        // and skip the partial first line, which belongs to the previous split
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 6
Source File: XMLFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public XMLFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 7
Source File: JsonFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 8
Source File: JsonDataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context) {
    String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
    String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
    String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
    tupleCounter = 0L;
    cleanTupleCounter = 0L;
    recordsEmittByMap = 0L;
    // Populating JSON keys and data type schema
    schema = getDatatypeExpression(jsonString);
    // Adding JSON keys given by the user
    keylist = getKeyList(jsonString);
    if (regexString != null) {
        // Populating JSON key and regex
        regex = getExpression(regexString);
    }
    if (nullString != null) {
        // Populating JSON key and null condition
        nullMap = getExpression(nullString);
    }
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    splitStartOffset = fileSplit.getStart();
    // calculating the end offset of the current split
    splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
    filename = fileSplit.getPath().toUri().getPath();
    filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH,
            JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
Example 9
Source File: PhoenixTextInputFormat.java From phoenix with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    rr.initialize(genericSplit, context);
    final Configuration conf = context.getConfiguration();
    final FileSplit split = (FileSplit) genericSplit;
    // only the split that starts at byte 0 contains the header line
    if (conf.getBoolean(SKIP_HEADER_KEY, false) && split.getStart() == 0) {
        LOGGER.trace("Consuming first key-value from {}", genericSplit);
        nextKeyValue();
    } else {
        LOGGER.trace("Not configured to skip header or not the first input split: {}", split);
    }
}
Example 10
Source File: JSONFileRecordReader.java From ojai with Apache License 2.0
@Override
public void initialize(InputSplit arg0, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    documentStream = null;
    it = null;
    documentCount = 0;
    key = new LongWritable();
    document = null;
    currentPos = 0;

    /* get the split */
    FileSplit split = (FileSplit) arg0;

    /* get the configuration object */
    Configuration job = taskContext.getConfiguration();

    /* initialize the file input stream */
    final Path path = split.getPath();
    FileSystem fs = path.getFileSystem(job);
    inputStream = fs.open(path);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(path);
    if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        inputStream = codec.createInputStream(inputStream, decompressor);
    }
    start = split.getStart();
    end = start + split.getLength();

    /* initialize a stream reader so that it can read multiple documents from the file */
    documentStream = (JsonDocumentStream) Json.newDocumentStream(inputStream);
    it = documentStream.iterator();
}
Example 11
Source File: AvroArrayReader.java From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
        throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) isplit;
    start = fsplit.getStart();
    end = fsplit.getStart() + fsplit.getLength();
    DatumReader<GenericData.Array<Object>> datumReader =
            new GenericDatumReader<GenericData.Array<Object>>(schema);
    reader = DataFileReader.openReader(
            new FsInput(fsplit.getPath(), tc.getConfiguration()), datumReader);
    // position the reader at the first sync marker at or after the split start
    reader.sync(start);
}
Example 12
Source File: FastaInputFormat.java From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec; uncompressed file
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }
    lineReader = new LineReader(inputStream);
}
Example 13
Source File: TFileRecordReader.java From tez with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    splitPath = fileSplit.getPath();
    fin = fs.open(splitPath);
    reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(),
            context.getConfiguration());
    scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
Example 14
Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = new GenericDatumReader<T>();
    this.reader = DataFileReader.openReader(in, datumReader);
    reader.sync(split.getStart()); // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
Example 15
Source File: UnenclosedBaseJsonRecordReader.java From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = fileSplit.getLength() + start;
    Path filePath = fileSplit.getPath();
    commonInit(filePath, taskContext.getConfiguration());
}
Example 16
Source File: DelimitedVectorInputFormat.java From mrgeo with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
    if (useNLineFormat) {
        List<InputSplit> splits = new NLineInputFormat().getSplits(context);
        // This is a workaround for what appears to be a bug in how NLineInputFormat
        // computes its splits. When there are multiple splits in a file, it seems
        // the start position in the last split is off by one. Note that this corrective
        // code needs to check the last split for each different file that appears
        // in the list of splits.
        for (int index = 2; index < splits.size(); index++) {
            FileSplit previousSplit = (FileSplit) splits.get(index - 1);
            FileSplit currSplit = (FileSplit) splits.get(index);
            // If this index is the last split, or we've moved on to splits from a
            // different file, then we need to adjust the last split for that file.
            int lastFileIndex = -1;
            if (index == splits.size() - 1) {
                lastFileIndex = index;
            } else if (!currSplit.getPath().equals(previousSplit.getPath())) {
                lastFileIndex = index - 1;
            }
            if (lastFileIndex >= 2) {
                FileSplit lastFileSplit = (FileSplit) splits.get(lastFileIndex);
                FileSplit priorSplit = (FileSplit) splits.get(lastFileIndex - 1);
                if (lastFileSplit.getPath().equals(priorSplit.getPath())
                        && priorSplit.getStart() + priorSplit.getLength() < lastFileSplit.getStart()) {
                    // Adjust the start of the last split so it abuts the prior split
                    FileSplit replacement = new FileSplit(lastFileSplit.getPath(),
                            priorSplit.getStart() + priorSplit.getLength(),
                            lastFileSplit.getLength() + 1,
                            lastFileSplit.getLocations());
                    log.info("Replacing split: " + lastFileSplit);
                    log.info("  With split: " + replacement);
                    splits.set(lastFileIndex, replacement);
                }
            }
        }
        return splits;
    } else {
        return new TextInputFormat().getSplits(context);
    }
}
Example 17
Source File: DataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes") protected void setup(Mapper.Context context) throws IOException, InterruptedException { lineNumber = 0; recordsEmittByMap = 0l ; noOfToupleProcessd = 0l; cleanTupleCounter=0l; lineVWCounterNOF = 0; lineVWCounterNC = 0; lineVWCounterDT = 0; lineVWCounterRX = 0; dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable(); dataViolationWBNOF = new DataViolationWB(); dataViolationWBNC = new DataViolationWB(); dataViolationWBDT = new DataViolationWB(); dataViolationWBRX = new DataViolationWB(); dataViolationWBArr = new DataViolationWB[1]; // populating data validation parameters String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING); String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX); Gson gson = new Gson(); Type type = new TypeToken<DataValidationBean>() { }.getType(); DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type); fieldSeparator = dataValidationBean.getFieldSeparator(); fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE); fieldValidationList = dataValidationBean.getFieldValidationList(); expectedNumOfFields = dataValidationBean.getNumOfFields(); validateArray = gson.fromJson(validateMatrix, boolean[][].class); keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) { /** * */ private static final long serialVersionUID = 8594885637377460020L; @Override protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) { if (size() > super.getCapacity()) { return true; } return false; } }; FileSplit split = ((FileSplit) context.getInputSplit()); splitStartOffset = split.getStart(); //calculating end offset of current split splitEndOffset = splitStartOffset + split.getLength() - 1; fileName = split.getPath().toUri().getPath(); fileName = fileName.replaceAll("/", ".").substring(1, fileName.length()); }
Example 18
Source File: DelimitedTextInputFormat.java From marklogic-contentpump with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, the size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits > 1 million, there is enough parallelism,
        // so there is no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off: " + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }

    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();

    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the split that begins the file and extract the header
            FSDataInputStream fileIn = fs.open(path);
            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                    ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream, CSVParserFormatter.getFormat(
                    delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }
                EncodingUtil.handleBOMUTF8(header, 0);
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }
        DelimitedSplit ds = new DelimitedSplit(
                new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }
    return populatedSplits;
}
Example 19
Source File: BAMInputFormat.java From Hadoop-BAM with MIT License
private int addProbabilisticSplits(
        List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg)
        throws IOException {
    final Path path = ((FileSplit) splits.get(i)).getPath();
    try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

        FileVirtualSplit previousSplit = null;

        for (; i < splits.size(); ++i) {
            FileSplit fspl = (FileSplit) splits.get(i);
            if (!fspl.getPath().equals(path))
                break;

            long beg = fspl.getStart();
            long end = beg + fspl.getLength();

            long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

            // As the guesser goes to the next BGZF block before looking for BAM
            // records, the ending BGZF blocks have to always be traversed fully.
            // Hence force the length to be 0xffff, the maximum possible.
            long alignedEnd = end << 16 | 0xffff;

            if (alignedBeg == end) {
                // No records detected in this split: merge it to the previous one.
                // This could legitimately happen e.g. if we have a split that is
                // so small that it only contains the middle part of a BGZF block.
                //
                // Of course, if it's the first split, then this is simply not a
                // valid BAM file.
                //
                // FIXME: In theory, any number of splits could only contain parts
                // of the BAM header before we start to see splits that contain BAM
                // records. For now, we require that the split size is at least as
                // big as the header and don't handle that case.
                if (previousSplit == null)
                    throw new IOException("'" + path + "': " +
                            "no reads in first split: bad BAM file or tiny split size?");

                previousSplit.setEndVirtualOffset(alignedEnd);
            } else {
                previousSplit = new FileVirtualSplit(
                        path, alignedBeg, alignedEnd, fspl.getLocations());
                if (logger.isDebugEnabled()) {
                    final long byteOffset = alignedBeg >>> 16;
                    final long recordOffset = alignedBeg & 0xffff;
                    logger.debug(
                            "Split {}: byte offset: {} record offset: {}, virtual offset: {}",
                            i, byteOffset, recordOffset, alignedBeg);
                }
                newSplits.add(previousSplit);
            }
        }
    }
    return i;
}
Example 20
Source File: ParquetInputSplit.java From parquet-mr with Apache License 2.0
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
    return new ParquetInputSplit(split.getPath(),
            split.getStart(), split.getStart() + split.getLength(), split.getLength(),
            split.getLocations(), null);
}