org.apache.hadoop.mapred.InputSplit Java Examples
The following examples show how to use org.apache.hadoop.mapred.InputSplit. Each example is taken from an open-source project; the originating source file and its license are noted above the code.
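Before the project-specific examples, here is a minimal sketch of the typical lifecycle of an InputSplit in the old mapred API: an InputFormat partitions the job's input into splits, and each split is then handed back to getRecordReader to iterate over its records. This sketch is illustrative only; the input path and the choice of TextInputFormat are assumptions for demonstration, not taken from any example below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputSplitWalkthrough {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // Illustrative input path -- replace with a real location.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // Ask the InputFormat to partition the input; the second argument
    // is only a hint for the desired number of splits.
    InputSplit[] splits = format.getSplits(job, 2);

    for (InputSplit split : splits) {
      // Every InputSplit reports its length in bytes (and preferred hosts via getLocations()).
      System.out.println("split length=" + split.getLength());

      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          // Process one record of this split here.
        }
      } finally {
        reader.close();
      }
    }
  }
}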
Example #1
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0

protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    // numSplits arg not currently supported, use 1 as dummy arg
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
Example #2
Source File: OrcInputFormat.java From hive-dwrf with Apache License 2.0

@Override
public RecordReader<NullWritable, OrcLazyRow> getRecordReader(InputSplit inputSplit, JobConf conf,
    Reporter reporter) throws IOException {
  ReaderWriterProfiler.setProfilerOptions(conf);
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  reporter.setStatus(fileSplit.toString());
  return new OrcRecordReader(
      OrcFile.createReader(fs, path, conf),
      conf,
      fileSplit.getStart(),
      fileSplit.getLength()
  );
}
Example #3
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0

static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
Example #4
Source File: HiveDynamoDBInputFormat.java From emr-dynamodb-connector with Apache License 2.0

@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);

  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context =
      buildHiveDynamoDBRecordReaderContext(bbSplit, conf, reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
Example #5
Source File: AbstractEvaluatorToPartitionStrategy.java From reef with Apache License 2.0

/**
 * Allocates the first available split into the evaluator.
 *
 * @param evaluatorId the evaluator id
 * @param value the queue of splits
 * @return a numberedSplit or null if it cannot find one
 */
protected NumberedSplit<InputSplit> allocateSplit(final String evaluatorId,
    final BlockingQueue<NumberedSplit<InputSplit>> value) {
  if (value == null) {
    LOG.log(Level.FINE, "Queue of splits can't be empty. Returning null");
    return null;
  }
  while (true) {
    final NumberedSplit<InputSplit> split = value.poll();
    if (split == null) {
      return null;
    }
    if (value == unallocatedSplits || unallocatedSplits.remove(split)) {
      LOG.log(Level.FINE, "Found split-" + split.getIndex() + " in the queue");
      final NumberedSplit<InputSplit> old = evaluatorToSplits.putIfAbsent(evaluatorId, split);
      if (old != null) {
        throw new RuntimeException("Trying to assign different splits to the same evaluator is not supported");
      } else {
        LOG.log(Level.FINE, "Returning " + split.getIndex());
        return split;
      }
    }
  }
}
Example #6
Source File: FrameReaderTextCell.java From systemds with Apache License 2.0

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
  if (fs.isDirectory(path)) {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for (InputSplit split : splits)
      readTextCellFrameFromInputSplit(split, informat, job, dest);
  } else {
    readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
  }
}
Example #7
Source File: MR4CInputFormatTest.java From mr4c with Apache License 2.0

private void doTest(List<List<String>> frameSplits, int overlapBefore, int overlapAfter,
    Integer chunkSize) throws Exception {
  AlgorithmConfig algoConfig = m_mgr.getExecutionSource().getAlgorithmConfig();
  algoConfig.addDimension(new DimensionConfig("frame", true, overlapBefore, overlapAfter, null, chunkSize, false));
  algoConfig.addDimension(new DimensionConfig("type", false, 0, 0, null, null, false));
  Set<Set<DataKey>> expectedKeySplits = buildExpectedSplits(frameSplits);
  MR4CInputFormat format = new MR4CInputFormat();
  InputSplit[] splits = format.getSplits(m_mgr.getExecutionSource(), 4);
  Set<Set<DataKey>> actualKeySplits = new HashSet<Set<DataKey>>();
  for (InputSplit split : splits) {
    MR4CInputSplit bbSplit = (MR4CInputSplit) split;
    actualKeySplits.add(new HashSet<DataKey>(bbSplit.getKeys().getKeys()));
  }
  assertEquals(expectedKeySplits, actualKeySplits);
}
Example #8
Source File: HadoopInputFormatTest.java From flink with Apache License 2.0

@Test
public void testOpenWithConfigurableReader() throws Exception {
  ConfigurableDummyRecordReader recordReader = mock(ConfigurableDummyRecordReader.class);
  DummyInputFormat inputFormat = mock(DummyInputFormat.class);
  when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
      .thenReturn(recordReader);

  HadoopInputFormat<String, Long> hadoopInputFormat =
      new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
  hadoopInputFormat.open(getHadoopInputSplit());

  verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
  verify(recordReader, times(1)).setConf(any(JobConf.class));
  verify(recordReader, times(1)).createKey();
  verify(recordReader, times(1)).createValue();

  assertThat(hadoopInputFormat.fetched, is(false));
}
Example #9
Source File: HadoopInputFormatTest.java From flink with Apache License 2.0

@Test
public void testOpenClose() throws Exception {
  DummyRecordReader recordReader = mock(DummyRecordReader.class);
  DummyInputFormat inputFormat = mock(DummyInputFormat.class);
  when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
      .thenReturn(recordReader);

  HadoopInputFormat<String, Long> hadoopInputFormat =
      new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
  hadoopInputFormat.open(getHadoopInputSplit());

  verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
  verify(recordReader, times(1)).createKey();
  verify(recordReader, times(1)).createValue();

  assertThat(hadoopInputFormat.fetched, is(false));

  hadoopInputFormat.close();
  verify(recordReader, times(1)).close();
}
Example #10
Source File: HdfsDataFragmenter.java From pxf with Apache License 2.0

protected List<InputSplit> getSplits(Path path) throws IOException {
  PxfInputFormat pxfInputFormat = new PxfInputFormat();
  PxfInputFormat.setInputPaths(jobConf, path);
  InputSplit[] splits = pxfInputFormat.getSplits(jobConf, 1);
  List<InputSplit> result = new ArrayList<>();

  /*
   * HD-2547: If the file is empty, an empty split is returned: no
   * locations and no length.
   */
  if (splits != null) {
    for (InputSplit split : splits) {
      if (split.getLength() > 0) {
        result.add(split);
      }
    }
  }

  return result;
}
Example #11
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

@Test
public void readEthereumBlockInputFormatBlock3346406() throws IOException,
    EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth3346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 3346406 contains at least one block");
  assertEquals(7, block.getEthereumTransactions().size(), "Block 3346406 must have 7 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 3346406");
  reader.close();
}
Example #12
Source File: SplittableXmlInputFormat.java From Hive-XML-SerDe with Apache License 2.0

@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf job,
    Reporter reporter) throws IOException {
  InputStream inputStream = null;
  try {
    inputStream = getInputStream(job, (FileSplit) inputSplit);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  }
  long start = ((FileSplit) inputSplit).getStart();
  long end = start + inputSplit.getLength();
  return new HiveXmlRecordReader(job, inputStream, start, end);
}
Example #13
Source File: StormParsedInputFormat.java From incubator-retired-mrql with Apache License 2.0

@Override
public RecordReader<MRContainer, MRContainer> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  StormEvaluator.load_source_dir();  // load the parsed source parameters from a file
  String path = ((FileSplit) split).getPath().toString();
  ParsedDataSource ds = (ParsedDataSource) DataSource.get(path, Plan.conf);
  return new ParsedRecordReader((FileSplit) split, job, ds.parser, (Trees) ds.args);
}
Example #14
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0

public void addSplit(InputSplit split) {
  wrappedSplits.add(split);
  try {
    length += split.getLength();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }
}
Example #15
Source File: ParquetRecordReaderWrapper.java From parquet-mr with Apache License 2.0

public ParquetRecordReaderWrapper(
    final ParquetInputFormat<ArrayWritable> newInputFormat,
    final InputSplit oldSplit,
    final JobConf oldJobConf,
    final Reporter reporter)
    throws IOException, InterruptedException {
  this(newInputFormat, oldSplit, oldJobConf, reporter, (new HiveBindingFactory()).create());
}
Example #16
Source File: DBInputFormat.java From hadoop-gpu with Apache License 2.0

/** {@inheritDoc} */
@SuppressWarnings("unchecked")
public RecordReader<LongWritable, T> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  Class inputClass = dbConf.getInputClass();
  try {
    return new DBRecordReader((DBInputSplit) split, inputClass, job);
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
Example #17
Source File: MRInputHelpers.java From tez with Apache License 2.0

@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, boolean groupSplits, boolean sortSplits, int numTasks) throws IOException {

  // This is the real InputFormat
  org.apache.hadoop.mapred.InputFormat inputFormat;
  try {
    inputFormat = jobConf.getInputFormat();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }

  org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;

  if (groupSplits) {
    org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    groupedFormat.setConf(jobConf);
    groupedFormat.setInputFormat(inputFormat);
    groupedFormat.setDesiredNumberOfSplits(numTasks);
    finalInputFormat = groupedFormat;
  } else {
    finalInputFormat = inputFormat;
  }

  org.apache.hadoop.mapred.InputSplit[] splits =
      finalInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());
  if (sortSplits) {
    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(splits, new OldInputSplitComparator());
  }
  return splits;
}
Example #18
Source File: BinaryProtocol.java From hadoop with Apache License 2.0

public void runMap(InputSplit split, int numReduces, boolean pipedInput) throws IOException {
  WritableUtils.writeVInt(stream, MessageType.RUN_MAP.code);
  writeObject(split);
  WritableUtils.writeVInt(stream, numReduces);
  WritableUtils.writeVInt(stream, pipedInput ? 1 : 0);
}
Example #19
Source File: InputFormatGrakn.java From grakn with GNU Affero General Public License v3.0

public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
  List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
  InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
  for (int i = 0; i < newInputSplits.size(); i++) {
    oldInputSplits[i] = (ColumnFamilySplit) newInputSplits.get(i);
  }
  return oldInputSplits;
}
Example #20
Source File: CompositeInputSplit.java From big-c with Apache License 2.0

/**
 * Collect a set of hosts from all child InputSplits.
 */
public String[] getLocations() throws IOException {
  HashSet<String> hosts = new HashSet<String>();
  for (InputSplit s : splits) {
    String[] hints = s.getLocations();
    if (hints != null && hints.length > 0) {
      for (String host : hints) {
        hosts.add(host);
      }
    }
  }
  return hosts.toArray(new String[hosts.size()]);
}
Example #21
Source File: LoadGeneratorMR.java From hadoop with Apache License 2.0

public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) throws IOException {

  return new RecordReader<LongWritable, Text>() {
    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value) throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }

    public Text createValue() {
      return new Text();
    }

    public long getPos() throws IOException {
      return 1;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 1;
    }
  };
}
Example #22
Source File: DummyInputFormat.java From hadoop with Apache License 2.0

public RecordReader<Object, Object> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  return new RecordReader<Object, Object>() {
    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #23
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0

public TezGroupedSplit(int numSplits, String wrappedInputFormatName,
    String[] locations, String rack) {
  this.wrappedSplits = new ArrayList<InputSplit>(numSplits);
  this.wrappedInputFormatName = wrappedInputFormatName;
  this.locations = locations;
  this.rack = rack;
}
Example #24
Source File: GFInputFormat.java From gemfirexd-oss with Apache License 2.0

@Override
public RecordReader<GFKey, PersistedEventImpl> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  CombineFileSplit cSplit = (CombineFileSplit) split;
  AbstractGFRecordReader reader = new AbstractGFRecordReader();
  reader.initialize(cSplit, job);
  return reader;
}
Example #25
Source File: BackgroundHiveSplitLoader.java From presto with Apache License 2.0

private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits,
    InternalHiveSplitFactory splitFactory) throws IOException {
  ListenableFuture<?> lastResult = COMPLETED_FUTURE;
  for (InputSplit inputSplit : targetSplits) {
    Optional<InternalHiveSplit> internalHiveSplit =
        splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
    if (internalHiveSplit.isPresent()) {
      lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
    }
    if (stopped) {
      return COMPLETED_FUTURE;
    }
  }
  return lastResult;
}
Example #26
Source File: MRReaderMapred.java From incubator-tez with Apache License 2.0

public MRReaderMapred(JobConf jobConf, InputSplit inputSplit, TezCounters tezCounters,
    TezCounter inputRecordCounter) throws IOException {
  this.jobConf = jobConf;
  this.tezCounters = tezCounters;
  this.inputRecordCounter = inputRecordCounter;
  inputFormat = this.jobConf.getInputFormat();
  if (inputSplit != null) {
    this.inputSplit = inputSplit;
    setupOldRecordReader();
  }
}
Example #27
Source File: DelegatingInputFormat.java From RDFS with Apache License 2.0

@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
Example #28
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public ReadTask(InputSplit split, TextInputFormat informat, JobConf job, MatrixBlock dest,
    long rlen, long clen, boolean mm, FileFormatPropertiesMM mmProps) {
  _split = split;
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _matrixMarket = mm;
  _mmProps = mmProps;
}
Example #29
Source File: DBInputFormat.java From hadoop with Apache License 2.0

/** {@inheritDoc} */
public InputSplit[] getSplits(JobConf job, int chunks) throws IOException {
  List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
      super.getSplits(Job.getInstance(job));
  InputSplit[] ret = new InputSplit[newSplits.size()];
  int i = 0;
  for (org.apache.hadoop.mapreduce.InputSplit s : newSplits) {
    org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split =
        (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit) s;
    ret[i++] = new DBInputSplit(split.getStart(), split.getEnd());
  }
  return ret;
}
Example #30
Source File: TestInputOutputFormat.java From hive-dwrf with Apache License 2.0

@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}