org.apache.hadoop.mapreduce.InputSplit Java Examples

Source File: TeraSortIngest.java From accumulo-examples with Apache License 2.0

7 votes

/**
 * Creates the configured number of splits, dividing the rows to generate between the mappers.
 *
 * <p>All splits but the last are created with {@code totalRows / numSplits} as their second
 * constructor argument; the last split receives whatever remains ({@code totalRows} minus the
 * rows already handed out), so any remainder of the integer division lands in the final split.
 *
 * @param job the job context whose configuration supplies {@code NUMROWS} (default 0) and
 *        {@code NUMSPLITS} (default 1)
 * @return exactly {@code NUMSPLITS} splits, in ascending order of their starting row
 * @throws IllegalArgumentException if the configured number of splits is less than 1
 */
@Override
public List<InputSplit> getSplits(JobContext job) {
  long totalRows = job.getConfiguration().getLong(NUMROWS, 0);
  int numSplits = job.getConfiguration().getInt(NUMSPLITS, 1);
  // Fail fast with a descriptive message: a value of 0 would otherwise surface as a division
  // by zero below, and a negative value as an "Illegal Capacity" error from the list.
  if (numSplits < 1) {
    throw new IllegalArgumentException(
        "Number of splits must be at least 1 but was " + numSplits);
  }
  long rowsPerSplit = totalRows / numSplits;
  log.info(
      "Generating " + totalRows + " using " + numSplits + " maps with step of " + rowsPerSplit);
  List<InputSplit> splits = new ArrayList<>(numSplits);
  long currentRow = 0;
  for (int split = 0; split < numSplits - 1; ++split) {
    splits.add(new RangeInputSplit(currentRow, rowsPerSplit));
    currentRow += rowsPerSplit;
  }
  // The last split absorbs the remainder of the integer division.
  splits.add(new RangeInputSplit(currentRow, totalRows - currentRow));
  log.info("Done Generating.");
  return splits;
}

Source File: SMInputFormat.java From spliceengine with GNU Affero General Public License v3.0

6 votes

/**
 * Creates a record reader and binds it to the table this input format reads from.
 *
 * <p>The instance-level {@code conf} is first added to {@code config} as a resource. If no
 * table has been resolved yet, it is looked up from the {@code TableInputFormat.INPUT_TABLE}
 * entry of {@code config} and cached in the {@code table} field.
 *
 * @param split the split to read (not consulted by this method)
 * @param config the configuration that receives {@code conf} and supplies the input table name
 * @return the newly created reader, also stored in the {@code rr} field
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
public SMRecordReaderImpl getRecordReader(InputSplit split, Configuration config) throws IOException,
        InterruptedException {
    config.addResource(conf);
    if (LOG.isDebugEnabled()) {
        SpliceLogUtils.debug(LOG, "getRecordReader with table=%s, inputTable=%s,conglomerate=%s",
                table,
                config.get(TableInputFormat.INPUT_TABLE),
                config.get(MRConstants.SPLICE_INPUT_CONGLOMERATE));
    }
    rr = new SMRecordReaderImpl(conf);
    if (table == null) {
        // Resolve the table lazily, on the first reader request only.
        TableName inputTableName = TableName.valueOf(config.get(TableInputFormat.INPUT_TABLE));
        ClientPartition partition =
                (ClientPartition) SIDriver.driver().getTableFactory().getTable(inputTableName);
        table = partition.unwrapDelegate();
    }
    rr.setHTable(table);
    if (LOG.isDebugEnabled()) {
        SpliceLogUtils.debug(LOG, "returning record reader");
    }
    return rr;
}

Source File: GenerateData.java From hadoop with Apache License 2.0

6 votes

/**
 * Creates one generation split per active task tracker whose name matches
 * {@code tracker_<host>:...}, each asked to generate an equal share of the configured bytes.
 *
 * <p>The share is {@code GRIDMIX_GEN_BYTES} divided by the tracker count reported by the
 * cluster status. Tracker names that do not match the pattern are reported on
 * {@code System.err} and skipped.
 *
 * @param jobCtxt the job context supplying the configuration
 * @return the generation splits, one per matching tracker
 * @throws IOException if the byte count is missing or negative, or if the cluster reports no
 *         task trackers to distribute the work over
 */
@Override
public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
  final JobClient client =
    new JobClient(new JobConf(jobCtxt.getConfiguration()));
  ClusterStatus stat = client.getClusterStatus(true);
  final long toGen =
    jobCtxt.getConfiguration().getLong(GRIDMIX_GEN_BYTES, -1);
  if (toGen < 0) {
    throw new IOException("Invalid/missing generation bytes: " + toGen);
  }
  final int nTrackers = stat.getTaskTrackers();
  // Without this guard an empty cluster would fail with an ArithmeticException
  // (division by zero) instead of a meaningful error.
  if (nTrackers <= 0) {
    throw new IOException("No task trackers available to generate data: " + nTrackers);
  }
  final long bytesPerTracker = toGen / nTrackers;
  final ArrayList<InputSplit> splits = new ArrayList<InputSplit>(nTrackers);
  // Group 1 captures the text between "tracker_" and the first colon.
  final Pattern trackerPattern = Pattern.compile("tracker_([^:]*):.*");
  final Matcher m = trackerPattern.matcher("");
  for (String tracker : stat.getActiveTrackerNames()) {
    m.reset(tracker);
    if (!m.find()) {
      System.err.println("Skipping node: " + tracker);
      continue;
    }
    final String name = m.group(1);
    splits.add(new GenSplit(bytesPerTracker, new String[] { name }));
  }
  return splits;
}

Source File: DelimitedTextReader.java From marklogic-contentpump with Apache License 2.0

6 votes

/**
 * Opens the file behind the split and sets up the CSV parser and its iterator.
 *
 * <p>Returns early, leaving the parser untouched, when {@code openFile} yields {@code null}.
 * When no URI name is configured, the method decides between generated ids (seeded with the
 * file path and the split's start offset) and a counter starting at zero.
 *
 * @param inSplit the split to read; cast to {@code FileSplit} when ids are generated
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
protected void initParser(InputSplit inSplit) throws IOException,
    InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (!generateId) {
            uriId = 0;
        } else {
            final String filePath = file.toUri().getPath();
            final long splitStart = ((FileSplit) inSplit).getStart();
            idGen = new IdGenerator(filePath + "-" + splitStart);
        }
    }
    parser = new CSVParser(instream,
        CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}

Source File: TestFileInputFormat.java From big-c with Apache License 2.0

6 votes

/**
 * Verifies that the first split of {@code test:///a1/a2} reports two locations and that the
 * per-host location info distinguishes "localhost" (on disk and in memory) from "otherhost"
 * (on disk only).
 */
@Test
public void testSplitLocationInfo() throws Exception {
  Configuration conf = getConfiguration();
  conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR,
      "test:///a1/a2");
  Job job = Job.getInstance(conf);
  TextInputFormat format = new TextInputFormat();
  InputSplit firstSplit = format.getSplits(job).get(0);

  String[] hosts = firstSplit.getLocations();
  Assert.assertEquals(2, hosts.length);
  SplitLocationInfo[] infos = firstSplit.getLocationInfo();
  Assert.assertEquals(2, infos.length);

  // The host order is not fixed, so pick each host's info by where its name appears.
  int localhostIndex = hosts[0].equals("localhost") ? 0 : 1;
  int otherhostIndex = hosts[0].equals("otherhost") ? 0 : 1;
  SplitLocationInfo localhostInfo = infos[localhostIndex];
  SplitLocationInfo otherhostInfo = infos[otherhostIndex];

  Assert.assertTrue(localhostInfo.isOnDisk());
  Assert.assertTrue(localhostInfo.isInMemory());
  Assert.assertTrue(otherhostInfo.isOnDisk());
  Assert.assertFalse(otherhostInfo.isInMemory());
}

Source File: CSVFileInputFormat.java From components with Apache License 2.0

6 votes

/**
 * Creates a CSV record reader from the delimiter, encoding, text-enclosure and escape settings
 * found in the task configuration.
 *
 * <p>Only the first character of the enclosure and escape settings is used; a missing or empty
 * setting is passed on as {@code null}.
 *
 * @param split the split to read (not consulted by this method)
 * @param context the task context supplying the configuration
 * @return the reader built by the four-argument {@code createRecordReader} overload
 * @throws IOException declared by the signature
 */
@Override
public CSVFileRecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
  final String rowDelimiter = context.getConfiguration().get(TALEND_ROW_DELIMITED);
  final String charsetName = context.getConfiguration().get(TALEND_ENCODING);
  final String enclosureSetting = context.getConfiguration().get(TALEND_TEXT_ENCLOSURE);
  final String escapeSetting = context.getConfiguration().get(TALEND_ESCAPE);

  final Character enclosure = (enclosureSetting == null || enclosureSetting.isEmpty())
      ? null
      : Character.valueOf(enclosureSetting.charAt(0));
  final Character escape = (escapeSetting == null || escapeSetting.isEmpty())
      ? null
      : Character.valueOf(escapeSetting.charAt(0));

  return createRecordReader(rowDelimiter, charsetName, enclosure, escape);
}

Source File: HadoopElementIterator.java From tinkerpop with Apache License 2.0

6 votes

/**
 * Creates an iterator over the elements of the given graph by opening one record reader per
 * input split of the graph's configured input format.
 *
 * <p>For file-based input formats the constructor returns without adding any reader when no
 * input location is configured or when no graph data is found at that location.
 *
 * @param graph the graph whose configuration determines the input format and location
 * @throws IllegalStateException wrapping any exception raised while preparing the readers
 */
public HadoopElementIterator(final HadoopGraph graph) {
    try {
        this.graph = graph;
        final Configuration hadoopConf = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
        final InputFormat<NullWritable, VertexWritable> format = ConfUtil.getReaderAsInputFormat(hadoopConf);
        if (format instanceof FileInputFormat) {
            final Storage storage = FileSystemStorage.open(hadoopConf);
            // Empty graph: either no input location is configured, or nothing is stored there.
            if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION)
                    || !Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent()) {
                return;
            }
            hadoopConf.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                    Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
        }
        final JobContextImpl jobContext =
                new JobContextImpl(hadoopConf, new JobID(UUID.randomUUID().toString(), 1));
        for (final InputSplit split : format.getSplits(jobContext)) {
            this.readers.add(
                    format.createRecordReader(split, new TaskAttemptContextImpl(hadoopConf, new TaskAttemptID())));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}

Source File: TestCRAMInputFormat.java From Hadoop-BAM with MIT License

6 votes

/**
 * Checks that reading the input through {@code AnySAMInputFormat} yields exactly as many
 * records as reading the same file directly with a {@code SamReader}.
 *
 * <p>Both readers are opened in try-with-resources blocks so that their underlying file
 * handles are released even when an assertion or read fails.
 */
@Test
public void testReader() throws Exception {
  int expectedCount = 0;
  try (SamReader samReader = SamReaderFactory.makeDefault()
      .referenceSequence(new File(URI.create(reference))).open(new File(input))) {
    // Only the number of records matters here, not their content.
    for (SAMRecord ignored : samReader) {
      expectedCount++;
    }
  }

  AnySAMInputFormat inputFormat = new AnySAMInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(1, splits.size());

  int actualCount = 0;
  try (RecordReader<LongWritable, SAMRecordWritable> reader = inputFormat
      .createRecordReader(splits.get(0), taskAttemptContext)) {
    reader.initialize(splits.get(0), taskAttemptContext);
    while (reader.nextKeyValue()) {
      actualCount++;
    }
  }

  assertEquals(expectedCount, actualCount);
}

Source File: BitcoinRawBlockFileInputFormat.java From hadoopcryptoledger with Apache License 2.0

5 votes

/**
 * Creates the record reader for raw Bitcoin blocks from the task configuration.
 *
 * @param split the split to read (not consulted by this method)
 * @param ctx the task context supplying the configuration
 * @return a new {@code BitcoinRawBlockRecordReader}; never {@code null}
 * @throws IOException if the reader cannot be created because its configuration is invalid
 */
@Override
public RecordReader<BytesWritable,BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx)  throws IOException {
	try {
		return new BitcoinRawBlockRecordReader(ctx.getConfiguration());
	} catch (HadoopCryptoLedgerConfigurationException e) {
		LOG.error(e);
		// Returning null here would only defer the failure to a NullPointerException in the
		// caller; report the real cause instead.
		throw new IOException("Cannot create BitcoinRawBlockRecordReader: invalid configuration", e);
	}
}

Source File: MapReduceDataStore.java From geowave with Apache License 2.0

5 votes

/**
 * Returns the input splits for a query against this data store.
 *
 * <p>NOTE(review): only the declaration is visible here; the parameter descriptions below are
 * inferred from the parameter names and types and should be confirmed against an
 * implementation.
 *
 * @param commonOptions query options common to all queries (presumably)
 * @param typeOptions options selecting the data types to query (presumably)
 * @param indexOptions options selecting the index to query (presumably)
 * @param constraints the query constraints
 * @param adapterStore the transient adapter store
 * @param aimStore the adapter-to-index mapping store
 * @param statsStore the data statistics store
 * @param internalAdapterStore the internal adapter store
 * @param indexStore the index store
 * @param context the Hadoop job context
 * @param minSplits presumably the minimum number of splits to produce - TODO confirm
 *        whether {@code null} is permitted
 * @param maxSplits presumably the maximum number of splits to produce - TODO confirm
 *        whether {@code null} is permitted
 * @return the list of input splits
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
public List<InputSplit> getSplits(
CommonQueryOptions commonOptions,
DataTypeQueryOptions<?> typeOptions,
IndexQueryOptions indexOptions,
QueryConstraints constraints,
TransientAdapterStore adapterStore,
AdapterIndexMappingStore aimStore,
DataStatisticsStore statsStore,
InternalAdapterStore internalAdapterStore,
IndexStore indexStore,
JobContext context,
Integer minSplits,
Integer maxSplits) throws IOException, InterruptedException;

Source File: TabletSplitSplit.java From datawave with Apache License 2.0

5 votes

/**
 * Gathers the location hints of every child split into a single duplicate-free array.
 *
 * @return the distinct hosts reported by the child splits, in no particular order; empty when
 *         no child reports any location
 * @throws IOException propagated from a child split's {@code getLocations()}
 * @throws InterruptedException propagated from a child split's {@code getLocations()}
 */
public String[] getLocations() throws IOException, InterruptedException {
    final HashSet<String> distinctHosts = new HashSet<>();
    for (final InputSplit child : splits) {
        final String[] childHosts = child.getLocations();
        if (childHosts == null) {
            // A child without location hints contributes nothing.
            continue;
        }
        for (final String host : childHosts) {
            distinctHosts.add(host);
        }
    }
    return distinctHosts.toArray(new String[0]);
}

Source File: BitcoinBlockFileInputFormat.java From hadoopcryptoledger with Apache License 2.0

5 votes

/**
 * Creates the record reader for parsed Bitcoin blocks from the task configuration.
 *
 * @param split the split to read (not consulted by this method)
 * @param ctx the task context supplying the configuration
 * @return a new {@code BitcoinBlockRecordReader}; never {@code null}
 * @throws IOException if the reader cannot be created because its configuration is invalid
 */
@Override
public RecordReader<BytesWritable,BitcoinBlock> createRecordReader(InputSplit split, TaskAttemptContext ctx) throws IOException {
	try {
		return new BitcoinBlockRecordReader(ctx.getConfiguration());
	} catch (HadoopCryptoLedgerConfigurationException e) {
		LOG.error(e);
		// Returning null here would only defer the failure to a NullPointerException in the
		// caller; report the real cause instead.
		throw new IOException("Cannot create BitcoinBlockRecordReader: invalid configuration", e);
	}
}

Source File: DelimitedJSONReader.java From marklogic-contentpump with Apache License 2.0

5 votes

/**
 * Prepares this reader for the given split: loads the configuration, records the split's file,
 * chooses how document URIs are produced, and opens the stream to read from.
 *
 * <p>When the split's path turns out to be a directory, a {@code FileIterator} is created over
 * it and the first split it returns is the one that gets opened.
 *
 * @param inSplit the split to read; must be a {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    initConfig(context);
    final FileSplit fileSplit = (FileSplit) inSplit;
    setFile(fileSplit.getPath());

    // URIs are either generated from path + start offset, or taken from a configured field.
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI,false);
    if (!generateId) {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    } else {
        idGen = new IdGenerator(file.toUri().getPath() + "-"
                + fileSplit.getStart());
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();

    fs = file.getFileSystem(context.getConfiguration());
    InputSplit splitToOpen = inSplit;
    if (fs.getFileStatus(file).isDirectory()) {
        iterator = new FileIterator(fileSplit, context);
        splitToOpen = iterator.next();
    }
    initFileStream(splitToOpen);
}

Source File: AbstractEthereumRecordReader.java From hadoopcryptoledger with Apache License 2.0

5 votes

/**
 * Initializes the block reader for the given split.
 *
 * <p>Three cases are handled: uncompressed input (the stream is positioned at the split start),
 * input compressed with a splittable codec (start and end are replaced by the codec's adjusted
 * values), and input compressed with a non-splittable codec.
 *
 * @param split split to be used (assumed to be a file split)
 * @param context context of the job
 * @throws java.io.IOException in case of errors reading from the filestream provided by Hadoop
 * @throws java.lang.InterruptedException in case of thread interruption
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
	final FileSplit fileSplit = (FileSplit) split;
	// Byte range covered by this split
	start = fileSplit.getStart();
	end = start + fileSplit.getLength();
	final Path file = fileSplit.getPath();
	codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
	final FileSystem fs = file.getFileSystem(context.getConfiguration());
	final FSDataInputStream fileIn = fs.open(file);

	if (!isCompressedInput()) {
		// Plain file: read directly, starting at the split offset
		fileIn.seek(start);
		ebr = new EthereumBlockReader(fileIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
		filePosition = fileIn;
		return;
	}

	decompressor = CodecPool.getDecompressor(codec);
	if (codec instanceof SplittableCompressionCodec) {
		final SplitCompressionInputStream compressedIn = ((SplittableCompressionCodec) codec).createInputStream(
				fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
		ebr = new EthereumBlockReader(compressedIn, this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
		start = compressedIn.getAdjustedStart();
		end = compressedIn.getAdjustedEnd();
		filePosition = compressedIn; // take pos from compressed stream
	} else {
		ebr = new EthereumBlockReader(codec.createInputStream(fileIn, decompressor),
				this.maxSizeEthereumBlock, this.bufferSize, this.useDirectBuffer);
		filePosition = fileIn;
	}
}

Source File: TeraGen.java From pravega-samples with Apache License 2.0

5 votes

/**
 * Builds {@code NUM_MAPS} row-range splits that together cover all rows to generate.
 *
 * <p>Split {@code i} (1-based) ends at {@code ceil(totalRows * i / numSplits)}, so the split
 * sizes differ by at most one row.
 *
 * @param job the job context supplying the row count and the number of maps (default 1)
 * @return the splits in ascending row order; empty when the number of maps is not positive
 */
public List<InputSplit> getSplits(JobContext job) {
  final long totalRows = getNumberOfRows(job);
  final int numSplits = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
  LOG.info("Generating " + totalRows + " using " + numSplits);
  final List<InputSplit> ranges = new ArrayList<InputSplit>();
  long firstRow = 0;
  int produced = 0;
  while (produced < numSplits) {
    produced++;
    // Cumulative row boundary after 'produced' splits
    final long boundary =
      (long) Math.ceil(totalRows * (double) produced / numSplits);
    ranges.add(new RangeInputSplit(firstRow, boundary - firstRow));
    firstRow = boundary;
  }
  return ranges;
}

Source File: MergeDictionaryJob.java From kylin with Apache License 2.0

5 votes

/**
 * Creates one {@code IntInputSplit} per map task, numbered from 0 upward.
 *
 * @param jobContext the job context; the task count is read from {@code num.map.tasks}
 *        (default 0)
 * @return the splits, in ascending order of their index
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final int taskCount = jobContext.getConfiguration().getInt("num.map.tasks", 0);
    final List<InputSplit> result = Lists.newArrayListWithCapacity(taskCount);

    int nextIndex = 0;
    while (nextIndex < taskCount) {
        result.add(new IntInputSplit(nextIndex));
        nextIndex++;
    }

    return result;
}

Source File: TeraGen.java From hadoop with Apache License 2.0

5 votes

/**
 * Divides the rows to generate into {@code NUM_MAPS} consecutive row ranges.
 *
 * <p>The end of each range is the ceiling of its proportional share of the total, so every
 * row is assigned to exactly one split.
 *
 * @param job the job context supplying the row count and the number of maps (default 1)
 * @return the splits in ascending row order; empty when the number of maps is not positive
 */
public List<InputSplit> getSplits(JobContext job) {
  final long rowCount = getNumberOfRows(job);
  final int mapCount = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
  LOG.info("Generating " + rowCount + " using " + mapCount);

  final List<InputSplit> result = new ArrayList<InputSplit>();
  long rangeStart = 0;
  for (int i = 0; i < mapCount; ++i) {
    final long rangeEnd =
      (long) Math.ceil(rowCount * (double)(i + 1) / mapCount);
    final long rangeLength = rangeEnd - rangeStart;
    result.add(new RangeInputSplit(rangeStart, rangeLength));
    rangeStart = rangeEnd;
  }
  return result;
}

Source File: TestMRKeyValueTextInputFormat.java From big-c with Apache License 2.0

5 votes

/**
 * Writes two gzip-compressed key/value files and checks that each is read back as one split
 * containing the expected values in order.
 */
@Test
public void testGzip() throws IOException, InterruptedException {
  Configuration conf = new Configuration(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, conf);
  localFs.delete(workDir, true);
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
            "line-1\tthe quick\nline-2\tbrown\nline-3\tfox jumped\n"
            + "line-4\tover\nline-5\t the lazy\nline-6\t dog\n");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
            "line-1\tthis is a test\nline-1\tof gzip\n");
  Job job = Job.getInstance(conf);
  FileInputFormat.setInputPaths(job, workDir);
  KeyValueTextInputFormat format = new KeyValueTextInputFormat();
  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());

  // Normalise the order so that part1 is always examined first.
  FileSplit first = (FileSplit) splits.get(0);
  if (first.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, first);
  }

  final String[][] expectedValues = {
    {"the quick", "brown", "fox jumped", "over", " the lazy", " dog"},
    {"this is a test", "of gzip"},
  };
  for (int s = 0; s < expectedValues.length; s++) {
    List<Text> actual = readSplit(format, splits.get(s), job);
    assertEquals("splits[" + s + "] length", expectedValues[s].length, actual.size());
    for (int r = 0; r < expectedValues[s].length; r++) {
      assertEquals("splits[" + s + "][" + r + "]", expectedValues[s][r],
                   actual.get(r).toString());
    }
  }
}

Source File: PageRankAlgorithm.java From rheem with Apache License 2.0

5 votes

/**
 * Creates the vertex reader for this input format.
 *
 * @param split the split to read (not consulted by this method)
 * @param context the task context (not consulted by this method)
 * @return a new {@code PageRankVertexReader}
 * @throws IOException declared by the signature
 */
@Override
public VertexReader<LongWritable, DoubleWritable, FloatWritable> createVertexReader(
        InputSplit split, TaskAttemptContext context) throws IOException {
    return new PageRankVertexReader();
}

Source File: AbstractEventRecordReader.java From datawave with Apache License 2.0

5 votes

/**
 * Initializes the event state from the task configuration and, for file-based splits, records
 * the raw file's name and modification time.
 *
 * @param genericSplit the split being read; only {@code FileSplit} instances carry file details
 * @param context the task context supplying the configuration
 * @throws IOException if the file system or the file status cannot be obtained
 */
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    initializeEvent(context.getConfiguration());

    if (!(genericSplit instanceof FileSplit)) {
        // Nothing file-specific to record for other split types.
        return;
    }

    final Path splitPath = ((FileSplit) genericSplit).getPath();
    final FileSystem fileSystem = splitPath.getFileSystem(context.getConfiguration());

    rawFileName = splitPath.toString();
    rawFileTimeStamp = fileSystem.getFileStatus(splitPath).getModificationTime();
}

Source File: InputFormatHadoop.java From grakn with GNU Affero General Public License v3.0

5 votes

/**
 * Initializes the wrapped reader and, when a graph filter is present in the configuration,
 * deserializes it into {@code graphFilter}.
 *
 * @param inputSplit the split handed on to the wrapped reader
 * @param taskAttemptContext the task context handed on to the wrapped reader and used to
 *        obtain the configuration
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
    reader.initialize(inputSplit, taskAttemptContext);

    final Configuration hadoopConf = taskAttemptContext.getConfiguration();
    final boolean filterConfigured = hadoopConf.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null;
    if (!filterConfigured) {
        return;
    }
    graphFilter = VertexProgramHelper.deserialize(
            ConfUtil.makeApacheConfiguration(hadoopConf),
            Constants.GREMLIN_HADOOP_GRAPH_FILTER);
}

Source File: TeraInputFormat.java From incubator-tez with Apache License 2.0

5 votes

/**
 * Opens the split's file and positions the stream at the first record boundary at or after
 * the split's start offset.
 *
 * @param split the split to read; must be a {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened or positioned
 * @throws InterruptedException declared by the signature
 */
public void initialize(InputSplit split, TaskAttemptContext context) 
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) split;
  final Path path = fileSplit.getPath();
  in = path.getFileSystem(context.getConfiguration()).open(path);

  final long splitStart = fileSplit.getStart();
  // Bytes past the previous record boundary; zero when the split starts on one.
  final long misalignment = splitStart % RECORD_LENGTH;
  offset = (RECORD_LENGTH - misalignment) % RECORD_LENGTH;
  in.seek(splitStart + offset);
  length = fileSplit.getLength();
}

Source File: CombineShimRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Initializes the child reader with the single file of the combined split that this reader is
 * responsible for (the one at position {@code index}).
 *
 * @param curSplit the combined split; must be a {@code CombineFileSplit}
 * @param curContext the task context passed on to the child reader
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public void initialize(InputSplit curSplit, TaskAttemptContext curContext)
    throws IOException, InterruptedException {
  split = (CombineFileSplit) curSplit;
  context = curContext;

  if (rr == null) {
    createChildReader();
  }

  // Carve this reader's file out of the combined split.
  final FileSplit childSplit = new FileSplit(
      split.getPath(index),
      split.getOffset(index),
      split.getLength(index),
      split.getLocations());
  rr.initialize(childSplit, context);
}

Source File: PhoenixInputFormat.java From phoenix with Apache License 2.0

5 votes

/**
 * Produces the input splits from the key ranges of the job's query plan.
 *
 * @param context the job context supplying the configuration
 * @return the splits generated from the query plan's key ranges
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    final QueryPlan plan = getQueryPlan(context, context.getConfiguration());
    return generateSplits(plan, plan.getSplits());
}

Source File: HadoopSortingTest.java From ignite with Apache License 2.0

5 votes

/**
 * {@inheritDoc}
 *
 * <p>Returns a list holding the same {@code FakeSplit(20)} instance ten times.
 */
@Override public List<InputSplit> getSplits(JobContext ctx) throws IOException, InterruptedException {
    final FakeSplit shared = new FakeSplit(20);
    final List<InputSplit> res = new ArrayList<>();

    int remaining = 10;

    while (remaining-- > 0)
        res.add(shared);

    return res;
}

Source File: NetezzaExternalTableInputFormat.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Creates one {@code NetezzaExternalTableInputSplit} per requested map task, numbered from 0.
 *
 * @param context the job context from which the number of map tasks is read
 * @return the splits, in ascending order of their index
 * @throws IOException declared by the signature
 * @throws InterruptedException declared by the signature
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
    InterruptedException {
  final int mapCount = ConfigurationHelper.getJobNumMaps(context);
  final List<InputSplit> result = new ArrayList<InputSplit>(mapCount);
  int sliceId = 0;
  while (sliceId < mapCount) {
    result.add(new NetezzaExternalTableInputSplit(sliceId));
    sliceId++;
  }
  return result;
}

Source File: NMapInputFormat.java From hbase with Apache License 2.0

5 votes

/**
 * Creates as many {@code NullInputSplit}s as there are configured map tasks.
 *
 * @param context the job context supplying the configuration
 * @return a list with one new {@code NullInputSplit} per map task
 */
@Override
public List<InputSplit> getSplits(JobContext context) {
  final int mapTasks = getNumMapTasks(context.getConfiguration());
  final List<InputSplit> result = new ArrayList<>(mapTasks);
  int remaining = mapTasks;
  while (remaining > 0) {
    result.add(new NullInputSplit());
    remaining--;
  }
  return result;
}

Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Opens the Avro file behind the split, syncs the reader to the split's start offset, and
 * records the byte range to read.
 *
 * @param genericSplit the split to read; must be a {@code FileSplit}
 * @param context the task context supplying the configuration
 * @throws IOException if the file cannot be opened or synced
 * @throws InterruptedException declared by the signature
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) genericSplit;
  final SeekableInput input = new FsInput(fileSplit.getPath(), context.getConfiguration());
  this.reader = DataFileReader.openReader(input, new GenericDatumReader<T>());

  final long splitStart = fileSplit.getStart();
  reader.sync(splitStart);
  // After the sync the reader's position is where this split's data really begins.
  this.start = reader.tell();
  this.end = splitStart + fileSplit.getLength();
}

Source File: MainframeDatasetInputFormat.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Distributes the retrieved datasets round-robin over the input splits.
 *
 * <p>The number of splits is the smaller of the dataset count and the configured number of
 * map tasks, but never less than one: with zero splits there would be nowhere to assign the
 * datasets (the previous implementation looped forever in that situation).
 *
 * @param job the job context supplying the configuration and the number of map tasks
 * @return the splits, with dataset {@code j} assigned to split {@code j % splitCount}
 * @throws IOException if no sequential datasets are retrieved for the configured name
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  List<InputSplit> splits = new ArrayList<InputSplit>();
  Configuration conf = job.getConfiguration();
  String dsName
      = conf.get(MainframeConfiguration.MAINFRAME_INPUT_DATASET_NAME);
  LOG.info("Datasets to transfer from: " + dsName);
  List<String> datasets = retrieveDatasets(dsName, conf);
  if (datasets.isEmpty()) {
    throw new IOException ("No sequential datasets retrieved from " + dsName);
  }

  int count = datasets.size();
  // Clamp to at least one split so a non-positive map count cannot leave the
  // datasets without a target.
  int chunks = Math.max(1, Math.min(count, ConfigurationHelper.getJobNumMaps(job)));
  for (int i = 0; i < chunks; i++) {
    splits.add(new MainframeDatasetInputSplit());
  }

  // Round-robin assignment: dataset j goes to split j % chunks.
  for (int j = 0; j < count; j++) {
    ((MainframeDatasetInputSplit) splits.get(j % chunks)).addDataset(datasets.get(j));
  }
  return splits;
}

Source File: MapContextImpl.java From incubator-tez with Apache License 2.0

5 votes

/**
 * Creates a map context that, in addition to the state handled by the superclass, keeps the
 * record reader and the input split of the task.
 *
 * @param conf passed to the superclass constructor
 * @param taskid passed to the superclass constructor
 * @param reader the record reader stored in this context
 * @param writer passed to the superclass constructor
 * @param committer passed to the superclass constructor
 * @param context passed to the superclass constructor
 * @param split the input split stored in this context
 * @param reporter passed to the superclass constructor
 */
public MapContextImpl(Configuration conf, TaskAttemptID taskid,
                      RecordReader<KEYIN,VALUEIN> reader,
                      RecordWriter<KEYOUT,VALUEOUT> writer,
                      OutputCommitter committer,
                      TezTaskContext context,
                      InputSplit split, Reporter reporter) {
  super(conf, taskid, writer, committer, context, reporter);
  this.split = split;
  this.reader = reader;
}

org.apache.hadoop.mapreduce.InputSplit Java Examples