Java Code Examples for org.apache.hadoop.mapreduce.InputSplit#getLength()
The following examples show how to use org.apache.hadoop.mapreduce.InputSplit#getLength().
The original project and source file are noted above each example.
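For orientation, the snippet below is a minimal, self-contained sketch of the pattern most of these examples share: summing getLength() over the splits an InputFormat produces to estimate the total input size. The use of TextInputFormat and the command-line path argument are illustrative assumptions, not code from any of the projects below; note that getLength() declares InterruptedException, which callers typically rewrap.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class TotalInputSize {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Hypothetical driver: point a TextInputFormat at the given input path
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        List<InputSplit> splits = new TextInputFormat().getSplits(job);
        long totalBytes = 0;
        for (InputSplit split : splits) {
            totalBytes += split.getLength(); // may throw InterruptedException
        }
        System.out.println("splits=" + splits.size() + ", totalBytes=" + totalBytes);
    }
}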
Example 1
Source File: AbstractHadoopJob.java From kylin-on-parquet-v2 with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }

    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }

    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 2
Source File: MapRedUtil.java From spork with Apache License 2.0
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    // debugging purpose only
    StringBuilder st = new StringBuilder();
    st.append("Number of splits :" + splits.length + "\n");
    long len = 0;
    for (InputSplit split : splits)
        len += split.getLength();
    st.append("Total Length = " + len + "\n");
    for (int i = 0; i < splits.length; i++) {
        st.append("Input split[" + i + "]:\n Length = " + splits[i].getLength() + "\n Locations:\n");
        for (String location : splits[i].getLocations())
            st.append(" " + location + "\n");
        st.append("\n-----------------------\n");
    }
    return st.toString();
}
Example 3
Source File: AbstractHadoopJob.java From kylin with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }

    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }

    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 4
Source File: AbstractHadoopJob.java From Kylin with Apache License 2.0
protected double getTotalMapInputMB()
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
Example 5
Source File: AggregateXMLReader.java From marklogic-contentpump with Apache License 2.0
protected void initStreamReader(InputSplit inSplit) throws IOException, InterruptedException {
    start = 0;
    end = inSplit.getLength();
    overflow = false;
    fInputStream = openFile(inSplit, true);
    if (fInputStream == null) {
        return;
    }
    try {
        xmlSR = f.createXMLStreamReader(fInputStream, encoding);
    } catch (XMLStreamException e) {
        LOG.error(e.getMessage(), e);
    }

    if (useAutomaticId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    }
}
Example 6
Source File: BinaryLoader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    // the whole split is buffered in memory, so its length must fit in an int
    byte[] buf = new byte[(int) inSplit.getLength()];
    System.out.println("split length: " + inSplit.getLength());
    try {
        fileIn.readFully(buf);
        value.set(buf, 0, (int) inSplit.getLength());
        System.out.println("value length: " + value.getBytes().length);
        hasNext = true;
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}
Example 7
Source File: DelimitedTextReader.java From marklogic-contentpump with Apache License 2.0
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}
Example 8
Source File: HadoopFormatIO.java From beam with Apache License 2.0
/**
 * This is a helper function to compute splits. This method will also calculate size of the data
 * being read. Note: This method is executed exactly once and the splits are retrieved and
 * cached in this. These splits are further used by split() and getEstimatedSizeBytes().
 */
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
    if (inputSplits != null) {
        return;
    }
    createInputFormatInstance();
    List<InputSplit> splits = inputFormatObj.getSplits(Job.getInstance(conf.get()));
    if (splits == null) {
        throw new IOException("Error in computing splits, getSplits() returns null.");
    }
    if (splits.isEmpty()) {
        throw new IOException("Error in computing splits, getSplits() returns a empty list");
    }
    boundedSourceEstimatedSize = 0;
    inputSplits = new ArrayList<>();
    for (InputSplit inputSplit : splits) {
        if (inputSplit == null) {
            throw new IOException(
                "Error in computing splits, split is null in InputSplits list "
                    + "populated by getSplits() : ");
        }
        boundedSourceEstimatedSize += inputSplit.getLength();
        inputSplits.add(new SerializableSplit(inputSplit));
    }
}
Example 9
Source File: CSVReaderBase.java From datawave with Apache License 2.0
public void initializeTotalSize(final InputSplit genericSplit) throws IOException {
    try {
        totalSize = genericSplit.getLength() * 4l;
    } catch (InterruptedException ex) {
        // getLength() declares InterruptedException; rewrap it so callers only see IOException
        throw new IOException("Interrupted Exception thrown while attempting to get split length", ex);
    }
}
Example 10
Source File: JobSplit.java From big-c with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
    try {
        this.locations = split.getLocations();
        this.inputDataLength = split.getLength();
        this.startOffset = startOffset;
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
Example 11
Source File: SequenceFileAsBinaryInputFormat.java From big-c with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path path = ((FileSplit) split).getPath();
    Configuration conf = context.getConfiguration();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = ((FileSplit) split).getStart() + split.getLength();
    if (((FileSplit) split).getStart() > in.getPosition()) {
        in.sync(((FileSplit) split).getStart()); // sync to start
    }
    this.start = in.getPosition();
    vbytes = in.createValueBytes();
    done = start >= end;
}
Example 12
Source File: CompositeInputSplit.java From big-c with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *         or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 13
Source File: DelimitedJSONReader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);
    /* Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
    if (generateId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}
Example 14
Source File: RDFReader.java From marklogic-contentpump with Apache License 2.0
protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    FSDataInputStream in = openFile(inSplit, false);
    if (in == null) {
        return;
    }
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName(), in);
}
Example 15
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0
public void addSplit(InputSplit split) {
    wrappedSplits.add(split);
    try {
        length += split.getLength();
    } catch (Exception e) {
        // getLength() throws checked exceptions; surface them as unchecked
        throw new TezUncheckedException(e);
    }
}
Example 16
Source File: SequenceFileAsBinaryInputFormat.java From hadoop with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path path = ((FileSplit) split).getPath();
    Configuration conf = context.getConfiguration();
    FileSystem fs = path.getFileSystem(conf);
    this.in = new SequenceFile.Reader(fs, path, conf);
    this.end = ((FileSplit) split).getStart() + split.getLength();
    if (((FileSplit) split).getStart() > in.getPosition()) {
        in.sync(((FileSplit) split).getStart()); // sync to start
    }
    this.start = in.getPosition();
    vbytes = in.createValueBytes();
    done = start >= end;
}
Example 17
Source File: CompositeInputSplit.java From hadoop with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *         or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 18
Source File: TabletSplitSplit.java From datawave with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 *
 * @throws IOException
 *             If capacity was not specified during construction or if capacity has been reached.
 * @throws InterruptedException
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    splits[fill++] = s;
    totsize += s.getLength();
}
Example 19
Source File: WikiLoader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(new String(buf, 0, read));
        if (sindex == -1) { // haven't found the start yet
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}
Example 20
Source File: JobSplit.java From big-c with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
    this(new TaskSplitIndex("", startOffset), split.getLocations(), split.getLength());
}