Java Code Examples for org.apache.hadoop.mapreduce.InputSplit#getLength()
The following examples show how to use org.apache.hadoop.mapreduce.InputSplit#getLength().
The original project and source file are noted above each example.
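For orientation, the snippet below is a minimal, self-contained sketch of the pattern most of these examples share: summing getLength() over the splits an InputFormat produces to estimate the total input size. The use of TextInputFormat and the command-line path argument are illustrative assumptions, not code from any of the projects below; note that getLength() declares InterruptedException, which callers typically rewrap.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class TotalInputSize {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Hypothetical driver: point a TextInputFormat at the given input path
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        List<InputSplit> splits = new TextInputFormat().getSplits(job);
        long totalBytes = 0;
        for (InputSplit split : splits) {
            totalBytes += split.getLength(); // may throw InterruptedException
        }
        System.out.println("splits=" + splits.size() + ", totalBytes=" + totalBytes);
    }
}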
Example 1
Source File: AbstractHadoopJob.java From kylin-on-parquet-v2 with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }

    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }

    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 2
Source File: MapRedUtil.java From spork with Apache License 2.0
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    // debugging purpose only
    StringBuilder st = new StringBuilder();
    st.append("Number of splits :" + splits.length + "\n");
    long len = 0;
    for (InputSplit split : splits)
        len += split.getLength();
    st.append("Total Length = " + len + "\n");
    for (int i = 0; i < splits.length; i++) {
        st.append("Input split[" + i + "]:\n Length = " + splits[i].getLength() + "\n Locations:\n");
        for (String location : splits[i].getLocations())
            st.append(" " + location + "\n");
        st.append("\n-----------------------\n");
    }
    return st.toString();
}
Example 3
Source File: AbstractHadoopJob.java From kylin with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }

    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }

    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 4
Source File: AbstractHadoopJob.java From Kylin with Apache License 2.0
protected double getTotalMapInputMB()
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 5
Source File: AggregateXMLReader.java From marklogic-contentpump with Apache License 2.0
protected void initStreamReader(InputSplit inSplit) throws IOException, InterruptedException {
    start = 0;
    end = inSplit.getLength();
    overflow = false;
    fInputStream = openFile(inSplit, true);
    if (fInputStream == null) {
        return;
    }
    try {
        xmlSR = f.createXMLStreamReader(fInputStream, encoding);
    } catch (XMLStreamException e) {
        LOG.error(e.getMessage(), e);
    }

    if (useAutomaticId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    }
}
Example 6
Source File: BinaryLoader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    // the whole split is buffered in memory, so its length must fit in an int
    byte[] buf = new byte[(int) inSplit.getLength()];
    System.out.println("split length: " + inSplit.getLength());
    try {
        fileIn.readFully(buf);
        value.set(buf, 0, (int) inSplit.getLength());
        System.out.println("value length: " + value.getBytes().length);
        hasNext = true;
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}
Example 7
Source File: DelimitedTextReader.java From marklogic-contentpump with Apache License 2.0
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}
Example 8
Source File: HadoopFormatIO.java From beam with Apache License 2.0
/**
 * This is a helper function to compute splits. This method will also calculate size of the data
 * being read. Note: This method is executed exactly once and the splits are retrieved and
 * cached in this. These splits are further used by split() and getEstimatedSizeBytes().
 */
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
    if (inputSplits != null) {
        return;
    }
    createInputFormatInstance();
    List<InputSplit> splits = inputFormatObj.getSplits(Job.getInstance(conf.get()));
    if (splits == null) {
        throw new IOException("Error in computing splits, getSplits() returns null.");
    }
    if (splits.isEmpty()) {
        throw new IOException("Error in computing splits, getSplits() returns a empty list");
    }
    boundedSourceEstimatedSize = 0;
    inputSplits = new ArrayList<>();
    for (InputSplit inputSplit : splits) {
        if (inputSplit == null) {
            throw new IOException(
                "Error in computing splits, split is null in InputSplits list "
                    + "populated by getSplits() : ");
        }
        boundedSourceEstimatedSize += inputSplit.getLength();
        inputSplits.add(new SerializableSplit(inputSplit));
    }
}
Example 9
Source File: CSVReaderBase.java From datawave with Apache License 2.0
public void initializeTotalSize(final InputSplit genericSplit) throws IOException {
    try {
        totalSize = genericSplit.getLength() * 4l;
    } catch (InterruptedException ex) {
        // getLength() declares InterruptedException; rewrap it so callers only see IOException
        throw new IOException("Interrupted Exception thrown while attempting to get split length", ex);
    }
}
Example 10
Source File: JobSplit.java From big-c with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
    try {
        this.locations = split.getLocations();
        this.inputDataLength = split.getLength();
        this.startOffset = startOffset;
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
Example 11
Source File: SequenceFileAsBinaryInputFormat.java From big-c with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path path = ((FileSplit) split).getPath();
    Configuration conf = context.getConfiguration();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = ((FileSplit) split).getStart() + split.getLength();
    if (((FileSplit) split).getStart() > in.getPosition()) {
        in.sync(((FileSplit) split).getStart()); // sync to start
    }
    this.start = in.getPosition();
    vbytes = in.createValueBytes();
    done = start >= end;
}
Example 12
Source File: CompositeInputSplit.java From big-c with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *         or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 13
Source File: DelimitedJSONReader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);
    /* Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
    if (generateId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}
Example 14
Source File: RDFReader.java From marklogic-contentpump with Apache License 2.0
protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    FSDataInputStream in = openFile(inSplit, false);
    if (in == null) {
        return;
    }
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName(), in);
}
Example 15
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0
public void addSplit(InputSplit split) {
    wrappedSplits.add(split);
    try {
        length += split.getLength();
    } catch (Exception e) {
        // getLength() throws checked exceptions; surface them as unchecked
        throw new TezUncheckedException(e);
    }
}
Example 16
Source File: SequenceFileAsBinaryInputFormat.java From hadoop with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path path = ((FileSplit) split).getPath();
    Configuration conf = context.getConfiguration();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = ((FileSplit) split).getStart() + split.getLength();
    if (((FileSplit) split).getStart() > in.getPosition()) {
        in.sync(((FileSplit) split).getStart()); // sync to start
    }
    this.start = in.getPosition();
    vbytes = in.createValueBytes();
    done = start >= end;
}
Example 17
Source File: CompositeInputSplit.java From hadoop with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *         or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 18
Source File: TabletSplitSplit.java From datawave with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 *
 * @throws IOException
 *             If capacity was not specified during construction or if capacity has been reached.
 * @throws InterruptedException
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 19
Source File: WikiLoader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(new String(buf, 0, read));
        if (sindex == -1) { // haven't found the start yet
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}
Example 20
Source File: JobSplit.java From big-c with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
    this(new TaskSplitIndex("", startOffset), split.getLocations(), split.getLength());
}