Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getStart(). Each example notes the project it was taken from, along with its license and original source file.
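Almost all of the examples share one pattern: cast the generic InputSplit to a FileSplit, compute the byte range [start, end) of the split from getStart() and getLength(), then open the underlying file and seek to start. The sketch below distills that pattern into a minimal initialize() method; the class name SplitBoundsSketch and its fields are illustrative only and do not come from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Minimal sketch of the recurring pattern; names are illustrative,
// not taken from any project below.
public class SplitBoundsSketch {
    private long start; // byte offset of the first byte of this split
    private long end;   // byte offset one past the last byte of this split
    private FSDataInputStream in;

    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();         // where this split begins in the file
        end = start + split.getLength();  // getStart() + getLength() bounds the split
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        in = fs.open(file);
        in.seek(start);                   // position the stream at the split start
    }
}

Note that end is exclusive, and that readers of record-oriented formats layer extra logic on top of this: a split with a nonzero getStart() typically backs up or skips a partial first record (Example 5), header handling is restricted to the split whose getStart() is 0 (Examples 9 and 18), and sync-marker formats like Avro seek to the next marker at or after getStart() (Examples 11 and 14).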
Example 1
Source File: GryoRecordReader.java From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(
                ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(
                    IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0)
        this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
Example 2
Source File: XmlCollectionWithTagInputFormat.java From vxquery with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);
    // the byte range this split covers
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());
    // open the file and seek to the start of the split
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
Example 3
Source File: XMLLoader.java From spork with Apache License 2.0
/**
 * Delegates the initialization call to the wrapped record reader after
 * extending the length of the split, so that a record crossing the original
 * split boundary can still be read in full.
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    key = new LongWritable();
    value = new Text();
    if (split instanceof FileSplit) {
        FileSplit fsplit = (FileSplit) split;
        originalEnd = fsplit.getStart() + fsplit.getLength();
        Path path = fsplit.getPath();
        long fileEnd = path.getFileSystem(context.getConfiguration()).getFileStatus(path).getLen();
        FileSplit extendedSplit = new FileSplit(path, fsplit.getStart(),
                Math.min(fsplit.getLength() * 10, fileEnd - fsplit.getStart()),
                fsplit.getLocations());
        this.wrapped.initialize(extendedSplit, context);
    } else {
        throw new RuntimeException("Cannot override a split of type '" + split.getClass() + "'");
    }
}
Example 4
Source File: TestUniformSizeInputFormat.java From big-c with Apache License 2.0
private void checkSplits(Path listFile, List<InputSplit> splits) throws IOException {
    long lastEnd = 0;

    // Verify that each split's start matches the previous split's end,
    // so no part of the input is missed
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        long start = fileSplit.getStart();
        Assert.assertEquals(lastEnd, start);
        lastEnd = start + fileSplit.getLength();
    }

    // Verify there is nothing more to read from the input file
    SequenceFile.Reader reader = new SequenceFile.Reader(cluster.getFileSystem().getConf(),
            SequenceFile.Reader.file(listFile));
    try {
        reader.seek(lastEnd);
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        Assert.assertFalse(reader.next(srcRelPath, srcFileStatus));
    } finally {
        IOUtils.closeStream(reader);
    }
}
Example 5
Source File: MultiLineInputFormat.java From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength",
            Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());
    if (start != 0) {
        // the split does not begin at the start of the file: back up one byte
        // and skip the partial first line, which belongs to the previous split
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 6
Source File: XMLFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public XMLFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 7
Source File: JsonFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 8
Source File: JsonDataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context) {
    String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
    String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
    String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
    tupleCounter = 0L;
    cleanTupleCounter = 0L;
    recordsEmittByMap = 0L;
    // Populating JSON keys and data type schema
    schema = getDatatypeExpression(jsonString);
    // Adding JSON keys given by the user
    keylist = getKeyList(jsonString);
    if (regexString != null) {
        // Populating JSON key and regex
        regex = getExpression(regexString);
    }
    if (nullString != null) {
        // Populating JSON key and null condition
        nullMap = getExpression(nullString);
    }
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    splitStartOffset = fileSplit.getStart();
    // calculating the end offset of the current split
    splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
    filename = fileSplit.getPath().toUri().getPath();
    filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH,
            JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
Example 9
Source File: PhoenixTextInputFormat.java From phoenix with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    rr.initialize(genericSplit, context);
    final Configuration conf = context.getConfiguration();
    final FileSplit split = (FileSplit) genericSplit;
    // only the split that starts at byte 0 contains the header line
    if (conf.getBoolean(SKIP_HEADER_KEY, false) && split.getStart() == 0) {
        LOGGER.trace("Consuming first key-value from {}", genericSplit);
        nextKeyValue();
    } else {
        LOGGER.trace("Not configured to skip header or not the first input split: {}", split);
    }
}
Example 10
Source File: JSONFileRecordReader.java From ojai with Apache License 2.0
@Override
public void initialize(InputSplit arg0, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    documentStream = null;
    it = null;
    documentCount = 0;
    key = new LongWritable();
    document = null;
    currentPos = 0;

    /* get the split */
    FileSplit split = (FileSplit) arg0;

    /* get the configuration object */
    Configuration job = taskContext.getConfiguration();

    /* initialize the file input stream */
    final Path path = split.getPath();
    FileSystem fs = path.getFileSystem(job);
    inputStream = fs.open(path);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(path);
    if (codec != null) {
        decompressor = CodecPool.getDecompressor(codec);
        inputStream = codec.createInputStream(inputStream, decompressor);
    }
    start = split.getStart();
    end = start + split.getLength();

    /* initialize a stream reader so that it can read multiple documents from the file */
    documentStream = (JsonDocumentStream) Json.newDocumentStream(inputStream);
    it = documentStream.iterator();
}
Example 11
Source File: AvroArrayReader.java From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc)
        throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) isplit;
    start = fsplit.getStart();
    end = fsplit.getStart() + fsplit.getLength();
    DatumReader<GenericData.Array<Object>> datumReader =
            new GenericDatumReader<GenericData.Array<Object>>(schema);
    reader = DataFileReader.openReader(
            new FsInput(fsplit.getPath(), tc.getConfiguration()), datumReader);
    // position the reader at the first sync marker at or after the split start
    reader.sync(start);
}
Example 12
Source File: FastaInputFormat.java From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec; uncompressed file
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }
    lineReader = new LineReader(inputStream);
}
Example 13
Source File: TFileRecordReader.java From tez with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    splitPath = fileSplit.getPath();
    fin = fs.open(splitPath);
    reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(),
            context.getConfiguration());
    scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
Example 14
Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = new GenericDatumReader<T>();
    this.reader = DataFileReader.openReader(in, datumReader);
    reader.sync(split.getStart()); // sync to start
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
Example 15
Source File: UnenclosedBaseJsonRecordReader.java From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = fileSplit.getLength() + start;
    Path filePath = fileSplit.getPath();
    commonInit(filePath, taskContext.getConfiguration());
}
Example 16
Source File: DelimitedVectorInputFormat.java From mrgeo with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    boolean useNLineFormat = context.getConfiguration().getBoolean(USE_NLINE_FORMAT, false);
    if (useNLineFormat) {
        List<InputSplit> splits = new NLineInputFormat().getSplits(context);
        // This is a workaround for what appears to be a bug in how NLineInputFormat
        // computes its splits. When there are multiple splits in a file, it seems
        // the start position in the last split is off by one. Note that this corrective
        // code needs to check the last split for each different file that appears
        // in the list of splits.
        for (int index = 2; index < splits.size(); index++) {
            FileSplit previousSplit = (FileSplit) splits.get(index - 1);
            FileSplit currSplit = (FileSplit) splits.get(index);
            // If this index is the last split, or we've moved on to splits from a
            // different file, then we need to adjust the last split for that file.
            int lastFileIndex = -1;
            if (index == splits.size() - 1) {
                lastFileIndex = index;
            } else if (!currSplit.getPath().equals(previousSplit.getPath())) {
                lastFileIndex = index - 1;
            }
            if (lastFileIndex >= 2) {
                FileSplit lastFileSplit = (FileSplit) splits.get(lastFileIndex);
                FileSplit priorSplit = (FileSplit) splits.get(lastFileIndex - 1);
                if (lastFileSplit.getPath().equals(priorSplit.getPath())
                        && priorSplit.getStart() + priorSplit.getLength() < lastFileSplit.getStart()) {
                    // Adjust the start of the last split so it abuts the prior split
                    FileSplit replacement = new FileSplit(lastFileSplit.getPath(),
                            priorSplit.getStart() + priorSplit.getLength(),
                            lastFileSplit.getLength() + 1,
                            lastFileSplit.getLocations());
                    log.info("Replacing split: " + lastFileSplit);
                    log.info("  With split: " + replacement);
                    splits.set(lastFileIndex, replacement);
                }
            }
        }
        return splits;
    } else {
        return new TextInputFormat().getSplits(context);
    }
}
Example 17
Source File: DataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes") protected void setup(Mapper.Context context) throws IOException, InterruptedException { lineNumber = 0; recordsEmittByMap = 0l ; noOfToupleProcessd = 0l; cleanTupleCounter=0l; lineVWCounterNOF = 0; lineVWCounterNC = 0; lineVWCounterDT = 0; lineVWCounterRX = 0; dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable(); dataViolationWBNOF = new DataViolationWB(); dataViolationWBNC = new DataViolationWB(); dataViolationWBDT = new DataViolationWB(); dataViolationWBRX = new DataViolationWB(); dataViolationWBArr = new DataViolationWB[1]; // populating data validation parameters String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING); String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX); Gson gson = new Gson(); Type type = new TypeToken<DataValidationBean>() { }.getType(); DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type); fieldSeparator = dataValidationBean.getFieldSeparator(); fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE); fieldValidationList = dataValidationBean.getFieldValidationList(); expectedNumOfFields = dataValidationBean.getNumOfFields(); validateArray = gson.fromJson(validateMatrix, boolean[][].class); keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) { /** * */ private static final long serialVersionUID = 8594885637377460020L; @Override protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) { if (size() > super.getCapacity()) { return true; } return false; } }; FileSplit split = ((FileSplit) context.getInputSplit()); splitStartOffset = split.getStart(); //calculating end offset of current split splitEndOffset = splitStartOffset + split.getLength() - 1; fileName = split.getPath().toUri().getPath(); fileName = fileName.replaceAll("/", ".").substring(1, fileName.length()); }
Example 18
Source File: DelimitedTextInputFormat.java From marklogic-contentpump with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, the size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits > 1 million, there is enough parallelism,
        // so there is no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off: " + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }

    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();

    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the split that begins the file and extract the header
            FSDataInputStream fileIn = fs.open(path);
            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
                    ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);
            CSVParser parser = new CSVParser(instream, CSVParserFormatter.getFormat(
                    delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();
            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }
                EncodingUtil.handleBOMUTF8(header, 0);
                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }
        DelimitedSplit ds = new DelimitedSplit(
                new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }
    return populatedSplits;
}
Example 19
Source File: BAMInputFormat.java From Hadoop-BAM with MIT License
private int addProbabilisticSplits(
        List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg)
        throws IOException {
    final Path path = ((FileSplit) splits.get(i)).getPath();
    try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

        FileVirtualSplit previousSplit = null;

        for (; i < splits.size(); ++i) {
            FileSplit fspl = (FileSplit) splits.get(i);
            if (!fspl.getPath().equals(path))
                break;

            long beg = fspl.getStart();
            long end = beg + fspl.getLength();

            long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

            // As the guesser goes to the next BGZF block before looking for BAM
            // records, the ending BGZF blocks have to always be traversed fully.
            // Hence force the length to be 0xffff, the maximum possible.
            long alignedEnd = end << 16 | 0xffff;

            if (alignedBeg == end) {
                // No records detected in this split: merge it to the previous one.
                // This could legitimately happen e.g. if we have a split that is
                // so small that it only contains the middle part of a BGZF block.
                //
                // Of course, if it's the first split, then this is simply not a
                // valid BAM file.
                //
                // FIXME: In theory, any number of splits could only contain parts
                // of the BAM header before we start to see splits that contain BAM
                // records. For now, we require that the split size is at least as
                // big as the header and don't handle that case.
                if (previousSplit == null)
                    throw new IOException("'" + path + "': " +
                            "no reads in first split: bad BAM file or tiny split size?");

                previousSplit.setEndVirtualOffset(alignedEnd);
            } else {
                previousSplit = new FileVirtualSplit(
                        path, alignedBeg, alignedEnd, fspl.getLocations());
                if (logger.isDebugEnabled()) {
                    final long byteOffset = alignedBeg >>> 16;
                    final long recordOffset = alignedBeg & 0xffff;
                    logger.debug(
                            "Split {}: byte offset: {} record offset: {}, virtual offset: {}",
                            i, byteOffset, recordOffset, alignedBeg);
                }
                newSplits.add(previousSplit);
            }
        }
    }
    return i;
}
Example 20
Source File: ParquetInputSplit.java From parquet-mr with Apache License 2.0
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
    return new ParquetInputSplit(split.getPath(),
            split.getStart(), split.getStart() + split.getLength(), split.getLength(),
            split.getLocations(), null);
}