org.apache.hadoop.mapred.InputSplit Java Examples
The following examples show how to use org.apache.hadoop.mapred.InputSplit. Each example is taken from an open-source project; the originating source file and its license are noted above the code.
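Before the project-specific examples, here is a minimal sketch of the typical lifecycle of an InputSplit in the old mapred API: an InputFormat partitions the job's input into splits, and each split is then handed back to getRecordReader to iterate over its records. This sketch is illustrative only; the input path and the choice of TextInputFormat are assumptions for demonstration, not taken from any example below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class InputSplitWalkthrough {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf();
    // Illustrative input path -- replace with a real location.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    // Ask the InputFormat to partition the input; the second argument
    // is only a hint for the desired number of splits.
    InputSplit[] splits = format.getSplits(job, 2);

    for (InputSplit split : splits) {
      // Every InputSplit reports its length in bytes (and preferred hosts via getLocations()).
      System.out.println("split length=" + split.getLength());

      RecordReader<LongWritable, Text> reader =
          format.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey();
      Text value = reader.createValue();
      try {
        while (reader.next(key, value)) {
          // Process one record of this split here.
        }
      } finally {
        reader.close();
      }
    }
  }
}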
Example #1
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0

protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    // numSplits arg not currently supported, use 1 as dummy arg
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
Example #2
Source File: OrcInputFormat.java From hive-dwrf with Apache License 2.0

@Override
public RecordReader<NullWritable, OrcLazyRow> getRecordReader(InputSplit inputSplit, JobConf conf,
    Reporter reporter) throws IOException {
  ReaderWriterProfiler.setProfilerOptions(conf);
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  reporter.setStatus(fileSplit.toString());
  return new OrcRecordReader(
      OrcFile.createReader(fs, path, conf),
      conf,
      fileSplit.getStart(),
      fileSplit.getLength()
  );
}
Example #3
Source File: ImportRecordReaderFactory.java From emr-dynamodb-connector with Apache License 2.0

static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
Example #4
Source File: HiveDynamoDBInputFormat.java From emr-dynamodb-connector with Apache License 2.0

@Override
public RecordReader<Text, DynamoDBItemWritable> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {
  reporter.progress();

  Map<String, String> columnMapping =
      HiveDynamoDBUtil.fromJsonString(conf.get(DynamoDBConstants.DYNAMODB_COLUMN_MAPPING));
  Map<String, String> hiveTypeMapping = HiveDynamoDBUtil.extractHiveTypeMapping(conf);
  DynamoDBQueryFilter queryFilter = getQueryFilter(conf, columnMapping, hiveTypeMapping);

  DynamoDBSplit bbSplit = (DynamoDBSplit) split;
  bbSplit.setDynamoDBFilterPushdown(queryFilter);

  Collection<String> attributes = (columnMapping == null ? null : columnMapping.values());
  DynamoDBRecordReaderContext context =
      buildHiveDynamoDBRecordReaderContext(bbSplit, conf, reporter, attributes);
  return new DefaultDynamoDBRecordReader(context);
}
Example #5
Source File: AbstractEvaluatorToPartitionStrategy.java From reef with Apache License 2.0

/**
 * Allocates the first available split into the evaluator.
 *
 * @param evaluatorId the evaluator id
 * @param value the queue of splits
 * @return a numberedSplit or null if it cannot find one
 */
protected NumberedSplit<InputSplit> allocateSplit(final String evaluatorId,
    final BlockingQueue<NumberedSplit<InputSplit>> value) {
  if (value == null) {
    LOG.log(Level.FINE, "Queue of splits can't be empty. Returning null");
    return null;
  }
  while (true) {
    final NumberedSplit<InputSplit> split = value.poll();
    if (split == null) {
      return null;
    }
    if (value == unallocatedSplits || unallocatedSplits.remove(split)) {
      LOG.log(Level.FINE, "Found split-" + split.getIndex() + " in the queue");
      final NumberedSplit<InputSplit> old = evaluatorToSplits.putIfAbsent(evaluatorId, split);
      if (old != null) {
        throw new RuntimeException("Trying to assign different splits to the same evaluator is not supported");
      } else {
        LOG.log(Level.FINE, "Returning " + split.getIndex());
        return split;
      }
    }
  }
}
Example #6
Source File: FrameReaderTextCell.java From systemds with Apache License 2.0

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen) throws IOException {
  if (fs.isDirectory(path)) {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for (InputSplit split : splits)
      readTextCellFrameFromInputSplit(split, informat, job, dest);
  } else {
    readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
  }
}
Example #7
Source File: MR4CInputFormatTest.java From mr4c with Apache License 2.0

private void doTest(List<List<String>> frameSplits, int overlapBefore, int overlapAfter,
    Integer chunkSize) throws Exception {
  AlgorithmConfig algoConfig = m_mgr.getExecutionSource().getAlgorithmConfig();
  algoConfig.addDimension(new DimensionConfig("frame", true, overlapBefore, overlapAfter, null, chunkSize, false));
  algoConfig.addDimension(new DimensionConfig("type", false, 0, 0, null, null, false));
  Set<Set<DataKey>> expectedKeySplits = buildExpectedSplits(frameSplits);
  MR4CInputFormat format = new MR4CInputFormat();
  InputSplit[] splits = format.getSplits(m_mgr.getExecutionSource(), 4);
  Set<Set<DataKey>> actualKeySplits = new HashSet<Set<DataKey>>();
  for (InputSplit split : splits) {
    MR4CInputSplit bbSplit = (MR4CInputSplit) split;
    actualKeySplits.add(new HashSet<DataKey>(bbSplit.getKeys().getKeys()));
  }
  assertEquals(expectedKeySplits, actualKeySplits);
}
Example #8
Source File: HadoopInputFormatTest.java From flink with Apache License 2.0

@Test
public void testOpenWithConfigurableReader() throws Exception {
  ConfigurableDummyRecordReader recordReader = mock(ConfigurableDummyRecordReader.class);
  DummyInputFormat inputFormat = mock(DummyInputFormat.class);
  when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
      .thenReturn(recordReader);

  HadoopInputFormat<String, Long> hadoopInputFormat =
      new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
  hadoopInputFormat.open(getHadoopInputSplit());

  verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
  verify(recordReader, times(1)).setConf(any(JobConf.class));
  verify(recordReader, times(1)).createKey();
  verify(recordReader, times(1)).createValue();

  assertThat(hadoopInputFormat.fetched, is(false));
}
Example #9
Source File: HadoopInputFormatTest.java From flink with Apache License 2.0

@Test
public void testOpenClose() throws Exception {
  DummyRecordReader recordReader = mock(DummyRecordReader.class);
  DummyInputFormat inputFormat = mock(DummyInputFormat.class);
  when(inputFormat.getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class)))
      .thenReturn(recordReader);

  HadoopInputFormat<String, Long> hadoopInputFormat =
      new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
  hadoopInputFormat.open(getHadoopInputSplit());

  verify(inputFormat, times(1)).getRecordReader(any(InputSplit.class), any(JobConf.class), any(Reporter.class));
  verify(recordReader, times(1)).createKey();
  verify(recordReader, times(1)).createValue();

  assertThat(hadoopInputFormat.fetched, is(false));

  hadoopInputFormat.close();
  verify(recordReader, times(1)).close();
}
Example #10
Source File: HdfsDataFragmenter.java From pxf with Apache License 2.0

protected List<InputSplit> getSplits(Path path) throws IOException {
  PxfInputFormat pxfInputFormat = new PxfInputFormat();
  PxfInputFormat.setInputPaths(jobConf, path);
  InputSplit[] splits = pxfInputFormat.getSplits(jobConf, 1);
  List<InputSplit> result = new ArrayList<>();

  /*
   * HD-2547: If the file is empty, an empty split is returned: no
   * locations and no length.
   */
  if (splits != null) {
    for (InputSplit split : splits) {
      if (split.getLength() > 0) {
        result.add(split);
      }
    }
  }

  return result;
}
Example #11
Source File: EthereumFormatHadoopTest.java From hadoopcryptoledger with Apache License 2.0

@Test
public void readEthereumBlockInputFormatBlock3346406() throws IOException,
    EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth3346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for genesis block");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 3346406 contains at least one block");
  assertEquals(7, block.getEthereumTransactions().size(), "Block 3346406 must have 7 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 3346406");
  reader.close();
}
Example #12
Source File: SplittableXmlInputFormat.java From Hive-XML-SerDe with Apache License 2.0

@Override
public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf job,
    Reporter reporter) throws IOException {
  InputStream inputStream = null;
  try {
    inputStream = getInputStream(job, (FileSplit) inputSplit);
  } catch (ClassNotFoundException e) {
    e.printStackTrace();
  }
  long start = ((FileSplit) inputSplit).getStart();
  long end = start + inputSplit.getLength();
  return new HiveXmlRecordReader(job, inputStream, start, end);
}
Example #13
Source File: StormParsedInputFormat.java From incubator-retired-mrql with Apache License 2.0

@Override
public RecordReader<MRContainer, MRContainer> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  StormEvaluator.load_source_dir();  // load the parsed source parameters from a file
  String path = ((FileSplit) split).getPath().toString();
  ParsedDataSource ds = (ParsedDataSource) DataSource.get(path, Plan.conf);
  return new ParsedRecordReader((FileSplit) split, job, ds.parser, (Trees) ds.args);
}
Example #14
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0

public void addSplit(InputSplit split) {
  wrappedSplits.add(split);
  try {
    length += split.getLength();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }
}
Example #15
Source File: ParquetRecordReaderWrapper.java From parquet-mr with Apache License 2.0

public ParquetRecordReaderWrapper(
    final ParquetInputFormat<ArrayWritable> newInputFormat,
    final InputSplit oldSplit,
    final JobConf oldJobConf,
    final Reporter reporter)
    throws IOException, InterruptedException {
  this(newInputFormat, oldSplit, oldJobConf, reporter, (new HiveBindingFactory()).create());
}
Example #16
Source File: DBInputFormat.java From hadoop-gpu with Apache License 2.0

/** {@inheritDoc} */
@SuppressWarnings("unchecked")
public RecordReader<LongWritable, T> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  Class inputClass = dbConf.getInputClass();
  try {
    return new DBRecordReader((DBInputSplit) split, inputClass, job);
  } catch (SQLException ex) {
    throw new IOException(ex.getMessage());
  }
}
Example #17
Source File: MRInputHelpers.java From tez with Apache License 2.0

@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(
    JobConf jobConf, boolean groupSplits, boolean sortSplits, int numTasks) throws IOException {

  // This is the real InputFormat
  org.apache.hadoop.mapred.InputFormat inputFormat;
  try {
    inputFormat = jobConf.getInputFormat();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }

  org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;

  if (groupSplits) {
    org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat =
        new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
    groupedFormat.setConf(jobConf);
    groupedFormat.setInputFormat(inputFormat);
    groupedFormat.setDesiredNumberOfSplits(numTasks);
    finalInputFormat = groupedFormat;
  } else {
    finalInputFormat = inputFormat;
  }

  org.apache.hadoop.mapred.InputSplit[] splits =
      finalInputFormat.getSplits(jobConf, jobConf.getNumMapTasks());
  if (sortSplits) {
    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(splits, new OldInputSplitComparator());
  }
  return splits;
}
Example #18
Source File: BinaryProtocol.java From hadoop with Apache License 2.0

public void runMap(InputSplit split, int numReduces, boolean pipedInput) throws IOException {
  WritableUtils.writeVInt(stream, MessageType.RUN_MAP.code);
  writeObject(split);
  WritableUtils.writeVInt(stream, numReduces);
  WritableUtils.writeVInt(stream, pipedInput ? 1 : 0);
}
Example #19
Source File: InputFormatGrakn.java From grakn with GNU Affero General Public License v3.0

public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
  TaskAttemptContext tac = HadoopCompat.newTaskAttemptContext(jobConf, new TaskAttemptID());
  List<org.apache.hadoop.mapreduce.InputSplit> newInputSplits = this.getSplits(tac);
  InputSplit[] oldInputSplits = new InputSplit[newInputSplits.size()];
  for (int i = 0; i < newInputSplits.size(); i++) {
    oldInputSplits[i] = (ColumnFamilySplit) newInputSplits.get(i);
  }
  return oldInputSplits;
}
Example #20
Source File: CompositeInputSplit.java From big-c with Apache License 2.0

/**
 * Collect a set of hosts from all child InputSplits.
 */
public String[] getLocations() throws IOException {
  HashSet<String> hosts = new HashSet<String>();
  for (InputSplit s : splits) {
    String[] hints = s.getLocations();
    if (hints != null && hints.length > 0) {
      for (String host : hints) {
        hosts.add(host);
      }
    }
  }
  return hosts.toArray(new String[hosts.size()]);
}
Example #21
Source File: LoadGeneratorMR.java From hadoop with Apache License 2.0

public RecordReader<LongWritable, Text> getRecordReader(
    InputSplit ignored, JobConf conf, Reporter reporter) throws IOException {

  return new RecordReader<LongWritable, Text>() {
    boolean sentOneRecord = false;

    public boolean next(LongWritable key, Text value) throws IOException {
      key.set(1);
      value.set("dummy");
      if (sentOneRecord == false) { // first call
        sentOneRecord = true;
        return true;
      }
      return false; // we have sent one record - we are done
    }

    public LongWritable createKey() {
      return new LongWritable();
    }

    public Text createValue() {
      return new Text();
    }

    public long getPos() throws IOException {
      return 1;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 1;
    }
  };
}
Example #22
Source File: DummyInputFormat.java From hadoop with Apache License 2.0

public RecordReader<Object, Object> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  return new RecordReader<Object, Object>() {
    boolean once = false;

    public boolean next(Object key, Object value) throws IOException {
      if (!once) {
        once = true;
        return true;
      }
      return false;
    }

    public Object createKey() {
      return new Object();
    }

    public Object createValue() {
      return new Object();
    }

    public long getPos() throws IOException {
      return 0L;
    }

    public void close() throws IOException {
    }

    public float getProgress() throws IOException {
      return 0.0f;
    }
  };
}
Example #23
Source File: TezGroupedSplit.java From incubator-tez with Apache License 2.0

public TezGroupedSplit(int numSplits, String wrappedInputFormatName,
    String[] locations, String rack) {
  this.wrappedSplits = new ArrayList<InputSplit>(numSplits);
  this.wrappedInputFormatName = wrappedInputFormatName;
  this.locations = locations;
  this.rack = rack;
}
Example #24
Source File: GFInputFormat.java From gemfirexd-oss with Apache License 2.0

@Override
public RecordReader<GFKey, PersistedEventImpl> getRecordReader(InputSplit split, JobConf job,
    Reporter reporter) throws IOException {
  CombineFileSplit cSplit = (CombineFileSplit) split;
  AbstractGFRecordReader reader = new AbstractGFRecordReader();
  reader.initialize(cSplit, job);
  return reader;
}
Example #25
Source File: BackgroundHiveSplitLoader.java From presto with Apache License 2.0

private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits,
    InternalHiveSplitFactory splitFactory) throws IOException {
  ListenableFuture<?> lastResult = COMPLETED_FUTURE;
  for (InputSplit inputSplit : targetSplits) {
    Optional<InternalHiveSplit> internalHiveSplit =
        splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
    if (internalHiveSplit.isPresent()) {
      lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
    }
    if (stopped) {
      return COMPLETED_FUTURE;
    }
  }
  return lastResult;
}
Example #26
Source File: MRReaderMapred.java From incubator-tez with Apache License 2.0

public MRReaderMapred(JobConf jobConf, InputSplit inputSplit, TezCounters tezCounters,
    TezCounter inputRecordCounter) throws IOException {
  this.jobConf = jobConf;
  this.tezCounters = tezCounters;
  this.inputRecordCounter = inputRecordCounter;
  inputFormat = this.jobConf.getInputFormat();
  if (inputSplit != null) {
    this.inputSplit = inputSplit;
    setupOldRecordReader();
  }
}
Example #27
Source File: DelegatingInputFormat.java From RDFS with Apache License 2.0

@SuppressWarnings("unchecked")
public RecordReader<K, V> getRecordReader(InputSplit split, JobConf conf,
    Reporter reporter) throws IOException {

  // Find the InputFormat and then the RecordReader from the
  // TaggedInputSplit.
  TaggedInputSplit taggedInputSplit = (TaggedInputSplit) split;
  InputFormat<K, V> inputFormat = (InputFormat<K, V>) ReflectionUtils
      .newInstance(taggedInputSplit.getInputFormatClass(), conf);
  return inputFormat.getRecordReader(taggedInputSplit.getInputSplit(), conf, reporter);
}
Example #28
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public ReadTask(InputSplit split, TextInputFormat informat, JobConf job, MatrixBlock dest,
    long rlen, long clen, boolean mm, FileFormatPropertiesMM mmProps) {
  _split = split;
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _matrixMarket = mm;
  _mmProps = mmProps;
}
Example #29
Source File: DBInputFormat.java From hadoop with Apache License 2.0

/** {@inheritDoc} */
public InputSplit[] getSplits(JobConf job, int chunks) throws IOException {
  List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
      super.getSplits(Job.getInstance(job));
  InputSplit[] ret = new InputSplit[newSplits.size()];
  int i = 0;
  for (org.apache.hadoop.mapreduce.InputSplit s : newSplits) {
    org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit split =
        (org.apache.hadoop.mapreduce.lib.db.DBInputFormat.DBInputSplit) s;
    ret[i++] = new DBInputSplit(split.getStart(), split.getEnd());
  }
  return ret;
}
Example #30
Source File: TestInputOutputFormat.java From hive-dwrf with Apache License 2.0

@Test
public void testEmptyFile() throws Exception {
  JobConf job = new JobConf(conf);
  Properties properties = new Properties();
  HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
  FileSinkOperator.RecordWriter writer =
      outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL);
  writer.close(true);
  properties.setProperty("columns", "x,y");
  properties.setProperty("columns.types", "int:int");
  SerDe serde = new OrcSerde();
  serde.initialize(conf, properties);
  InputFormat<?, ?> in = new OrcInputFormat();
  FileInputFormat.setInputPaths(conf, testFilePath.toString());
  InputSplit[] splits = in.getSplits(conf, 1);
  assertEquals(1, splits.length);

  // read the whole file
  conf.set("hive.io.file.readcolumn.ids", "0,1");
  org.apache.hadoop.mapred.RecordReader reader =
      in.getRecordReader(splits[0], conf, Reporter.NULL);
  Object key = reader.createKey();
  Object value = reader.createValue();
  assertEquals(0.0, reader.getProgress(), 0.00001);
  assertEquals(0, reader.getPos());
  assertEquals(false, reader.next(key, value));
  reader.close();
  assertEquals(null, serde.getSerDeStats());
}