Java Code Examples for org.apache.hadoop.mapreduce.InputFormat#getSplits()
The following examples show how to use
org.apache.hadoop.mapreduce.InputFormat#getSplits().
Each example notes the source file and the open-source project it was taken from.
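Before the project examples, here is a minimal, self-contained sketch of the usual pattern: configure a Job, ask an InputFormat for its splits, then open and drive a RecordReader per split. This is only an illustrative sketch; the choice of TextInputFormat, the placeholder input path "/tmp/input", and the dummy TaskAttemptContext are assumptions and are not taken from the examples below.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class GetSplitsSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration());
    // Illustrative input path; replace with a real directory or file.
    FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

    // Ask the InputFormat to compute the logical splits for the job.
    InputFormat<LongWritable, Text> format = new TextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    System.out.println("Number of splits: " + splits.size());

    // Outside a running task there is no real attempt context, so build a dummy one.
    TaskAttemptContext context =
        new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
      reader.initialize(split, context);
      try {
        while (reader.nextKeyValue()) {
          // process reader.getCurrentKey() / reader.getCurrentValue()
        }
      } finally {
        reader.close();
      }
    }
  }
}

Most of the examples below follow this same shape: obtain the splits from the InputFormat, then create and initialize a RecordReader per split, either directly or through a MapContextImpl in test code.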
Example 1
Source File: AbstractHadoopJob.java From Kylin with Apache License 2.0 | 6 votes |
protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException {
  if (job == null) {
    throw new JobException("Job is null");
  }

  long mapInputBytes = 0;
  InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
  for (InputSplit split : input.getSplits(job)) {
    mapInputBytes += split.getLength();
  }
  if (mapInputBytes == 0) {
    throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
  }
  double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
  return totalMapInputMB;
}
Example 2
Source File: StatementPatternStorageTest.java From rya with Apache License 2.0 | 6 votes |
protected List<StatementPatternStorage> createStorages(final String location) throws IOException, InterruptedException {
  final List<StatementPatternStorage> storages = new ArrayList<StatementPatternStorage>();
  StatementPatternStorage storage = new StatementPatternStorage();
  final InputFormat<?, ?> inputFormat = storage.getInputFormat();
  Job job = Job.getInstance(new Configuration());
  storage.setLocation(location, job);
  final List<InputSplit> splits = inputFormat.getSplits(job);
  assertNotNull(splits);

  for (final InputSplit inputSplit : splits) {
    storage = new StatementPatternStorage();
    job = Job.getInstance(new Configuration());
    storage.setLocation(location, job);
    final TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
        new TaskAttemptID("jtid", 0, TaskType.REDUCE, 0, 0));
    final RecordReader<?, ?> recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
    recordReader.initialize(inputSplit, taskAttemptContext);

    storage.prepareToRead(recordReader, null);
    storages.add(storage);
  }
  return storages;
}
Example 3
Source File: HadoopElementIterator.java From tinkerpop with Apache License 2.0 | 6 votes |
public HadoopElementIterator(final HadoopGraph graph) {
  try {
    this.graph = graph;
    final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
    final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
    if (inputFormat instanceof FileInputFormat) {
      final Storage storage = FileSystemStorage.open(configuration);
      if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
        return; // there is no input location and thus, no data (empty graph)
      if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
        return; // there is no data at the input location (empty graph)
      configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
          Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
    }
    final List<InputSplit> splits = inputFormat.getSplits(
        new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
    for (final InputSplit split : splits) {
      this.readers.add(inputFormat.createRecordReader(split,
          new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
    }
  } catch (final Exception e) {
    throw new IllegalStateException(e.getMessage(), e);
  }
}
Example 4
Source File: AbstractHadoopJob.java From kylin with Apache License 2.0 | 6 votes |
public static double getTotalMapInputMB(Job job) throws ClassNotFoundException, IOException, InterruptedException, JobException {
  if (job == null) {
    throw new JobException("Job is null");
  }

  long mapInputBytes = 0;
  InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
  for (InputSplit split : input.getSplits(job)) {
    mapInputBytes += split.getLength();
  }
  // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
  if (mapInputBytes == 0) {
    logger.warn("Map input splits are 0 bytes, something is wrong?");
  }
  double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
  return totalMapInputMB;
}
Example 5
Source File: AbstractHadoopJob.java From kylin-on-parquet-v2 with Apache License 2.0 | 6 votes |
public static double getTotalMapInputMB(Job job) throws ClassNotFoundException, IOException, InterruptedException, JobException {
  if (job == null) {
    throw new JobException("Job is null");
  }

  long mapInputBytes = 0;
  InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
  for (InputSplit split : input.getSplits(job)) {
    mapInputBytes += split.getLength();
  }
  // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470)
  if (mapInputBytes == 0) {
    logger.warn("Map input splits are 0 bytes, something is wrong?");
  }
  double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
  return totalMapInputMB;
}
Example 6
Source File: TestMRSequenceFileInputFilter.java From big-c with Apache License 2.0 | 5 votes |
private int countRecords(int numSplits) throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format = new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());

  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader = format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext =
        new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
Example 7
Source File: IndexedStorage.java From spork with Apache License 2.0 | 5 votes |
/**
 * IndexableLoadFunc interface implementation
 */
@Override
public void initialize(Configuration conf) throws IOException {
  try {
    InputFormat inputFormat = this.getInputFormat();
    TaskAttemptID id = HadoopShims.getNewTaskAttemptID();

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
      conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
    this.readers = new IndexedStorageRecordReader[fileSplits.size()];

    int idx = 0;
    Iterator<FileSplit> it = fileSplits.iterator();
    while (it.hasNext()) {
      FileSplit fileSplit = it.next();
      TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
      IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context);
      r.initialize(fileSplit, context);
      this.readers[idx] = r;
      idx++;
    }

    Arrays.sort(this.readers, this.readerComparator);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
}
Example 8
Source File: InputSampler.java From big-c with Apache License 2.0 | 5 votes |
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
            reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 9
Source File: InputSampler.java From big-c with Apache License 2.0 | 5 votes |
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
          reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 10
Source File: InputSampler.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * For each split sampled, emit when the ratio of the number of records
 * retained to the total record count is less than the specified
 * frequency.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>();
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  long records = 0;
  long kept = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      ++records;
      if ((double) kept / records < freq) {
        samples.add(ReflectionUtils.copy(job.getConfiguration(),
            reader.getCurrentKey(), null));
        ++kept;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 11
Source File: InputSampler.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());
  int samplesPerSplit = numSamples / splitsToSample;
  long records = 0;
  for (int i = 0; i < splitsToSample; ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      samples.add(ReflectionUtils.copy(job.getConfiguration(),
          reader.getCurrentKey(), null));
      ++records;
      if ((i+1) * samplesPerSplit <= records) {
        break;
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 12
Source File: TestMRSequenceFileInputFilter.java From hadoop with Apache License 2.0 | 5 votes |
private int countRecords(int numSplits) throws IOException, InterruptedException {
  InputFormat<Text, BytesWritable> format = new SequenceFileInputFilter<Text, BytesWritable>();
  if (numSplits == 0) {
    numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
  }
  FileInputFormat.setMaxInputSplitSize(job, fs.getFileStatus(inFile).getLen() / numSplits);
  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());

  // check each split
  int count = 0;
  for (InputSplit split : format.getSplits(job)) {
    RecordReader<Text, BytesWritable> reader = format.createRecordReader(split, context);
    MapContext<Text, BytesWritable, Text, BytesWritable> mcontext =
        new MapContextImpl<Text, BytesWritable, Text, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        LOG.info("Accept record " + reader.getCurrentKey().toString());
        count++;
      }
    } finally {
      reader.close();
    }
  }
  return count;
}
Example 13
Source File: TestMRSequenceFileAsBinaryInputFormat.java From big-c with Apache License 2.0 | 4 votes |
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader = bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
            "Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*",
            cmpkey.toString().equals(tkey.toString()));
        assertTrue(
            "Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*",
            cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
Example 14
Source File: TestMRSequenceFileAsBinaryOutputFormat.java From big-c with Apache License 2.0 | 4 votes |
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
Example 15
Source File: TestCombineSequenceFileInputFormat.java From big-c with Apache License 2.0 | 4 votes |
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
      new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits = random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got = " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader = format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
        new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
        CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
Example 16
Source File: InputSampler.java From big-c with Apache License 2.0 | 4 votes |
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
      (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
              reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 17
Source File: InputSampler.java From hadoop with Apache License 2.0 | 4 votes |
/**
 * Randomize the split order, then take the specified number of keys from
 * each split sampled, where each key is selected with the specified
 * probability and possibly replaced by a subsequently selected key when
 * the quota of keys from that split is satisfied.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K,V> inf, Job job)
    throws IOException, InterruptedException {
  List<InputSplit> splits = inf.getSplits(job);
  ArrayList<K> samples = new ArrayList<K>(numSamples);
  int splitsToSample = Math.min(maxSplitsSampled, splits.size());

  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  LOG.debug("seed: " + seed);
  // shuffle splits
  for (int i = 0; i < splits.size(); ++i) {
    InputSplit tmp = splits.get(i);
    int j = r.nextInt(splits.size());
    splits.set(i, splits.get(j));
    splits.set(j, tmp);
  }
  // our target rate is in terms of the maximum number of sample splits,
  // but we accept the possibility of sampling additional splits to hit
  // the target sample keyset
  for (int i = 0; i < splitsToSample ||
      (i < splits.size() && samples.size() < numSamples); ++i) {
    TaskAttemptContext samplingContext = new TaskAttemptContextImpl(
        job.getConfiguration(), new TaskAttemptID());
    RecordReader<K,V> reader = inf.createRecordReader(
        splits.get(i), samplingContext);
    reader.initialize(splits.get(i), samplingContext);
    while (reader.nextKeyValue()) {
      if (r.nextDouble() <= freq) {
        if (samples.size() < numSamples) {
          samples.add(ReflectionUtils.copy(job.getConfiguration(),
              reader.getCurrentKey(), null));
        } else {
          // When exceeding the maximum number of samples, replace a
          // random element with this one, then adjust the frequency
          // to reflect the possibility of existing elements being
          // pushed out
          int ind = r.nextInt(numSamples);
          if (ind != numSamples) {
            samples.set(ind, ReflectionUtils.copy(job.getConfiguration(),
                reader.getCurrentKey(), null));
          }
          freq *= (numSamples - 1) / (double) numSamples;
        }
      }
    }
    reader.close();
  }
  return (K[]) samples.toArray();
}
Example 18
Source File: TestCombineSequenceFileInputFormat.java From hadoop with Apache License 2.0 | 4 votes |
@Test(timeout=10000)
public void testFormat() throws IOException, InterruptedException {
  Job job = Job.getInstance(conf);

  Random random = new Random();
  long seed = random.nextLong();
  random.setSeed(seed);

  localFs.delete(workDir, true);
  FileInputFormat.setInputPaths(job, workDir);

  final int length = 10000;
  final int numFiles = 10;

  // create files with a variety of lengths
  createFiles(length, numFiles, random, job);

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  // create a combine split for the files
  InputFormat<IntWritable,BytesWritable> format =
      new CombineSequenceFileInputFormat<IntWritable,BytesWritable>();
  for (int i = 0; i < 3; i++) {
    int numSplits = random.nextInt(length/(SequenceFile.SYNC_INTERVAL/20)) + 1;
    LOG.info("splitting: requesting = " + numSplits);
    List<InputSplit> splits = format.getSplits(job);
    LOG.info("splitting: got = " + splits.size());

    // we should have a single split as the length is comfortably smaller than
    // the block size
    assertEquals("We got more than one splits!", 1, splits.size());
    InputSplit split = splits.get(0);
    assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());

    // check the split
    BitSet bits = new BitSet(length);
    RecordReader<IntWritable,BytesWritable> reader = format.createRecordReader(split, context);
    MapContext<IntWritable,BytesWritable,IntWritable,BytesWritable> mcontext =
        new MapContextImpl<IntWritable,BytesWritable,IntWritable,BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    assertEquals("reader class is CombineFileRecordReader.",
        CombineFileRecordReader.class, reader.getClass());

    try {
      while (reader.nextKeyValue()) {
        IntWritable key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();
        assertNotNull("Value should not be null.", value);
        final int k = key.get();
        LOG.debug("read " + k);
        assertFalse("Key in multiple partitions.", bits.get(k));
        bits.set(k);
      }
    } finally {
      reader.close();
    }
    assertEquals("Some keys in no partition.", length, bits.cardinality());
  }
}
Example 19
Source File: TestMRSequenceFileAsBinaryInputFormat.java From hadoop with Apache License 2.0 | 4 votes |
public void testBinary() throws IOException, InterruptedException {
  Job job = Job.getInstance();
  FileSystem fs = FileSystem.getLocal(job.getConfiguration());
  Path dir = new Path(System.getProperty("test.build.data",".") + "/mapred");
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  Text tkey = new Text();
  Text tval = new Text();

  SequenceFile.Writer writer =
      new SequenceFile.Writer(fs, job.getConfiguration(), file, Text.class, Text.class);
  try {
    for (int i = 0; i < RECORDS; ++i) {
      tkey.set(Integer.toString(r.nextInt(), 36));
      tval.set(Long.toString(r.nextLong(), 36));
      writer.append(tkey, tval);
    }
  } finally {
    writer.close();
  }

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  InputFormat<BytesWritable,BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();

  int count = 0;
  r.setSeed(seed);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  Text cmpkey = new Text();
  Text cmpval = new Text();
  DataInputBuffer buf = new DataInputBuffer();
  FileInputFormat.setInputPaths(job, file);
  for (InputSplit split : bformat.getSplits(job)) {
    RecordReader<BytesWritable, BytesWritable> reader = bformat.createRecordReader(split, context);
    MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      while (reader.nextKeyValue()) {
        bkey = reader.getCurrentKey();
        bval = reader.getCurrentValue();
        tkey.set(Integer.toString(r.nextInt(), 36));
        tval.set(Long.toString(r.nextLong(), 36));
        buf.reset(bkey.getBytes(), bkey.getLength());
        cmpkey.readFields(buf);
        buf.reset(bval.getBytes(), bval.getLength());
        cmpval.readFields(buf);
        assertTrue(
            "Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*",
            cmpkey.toString().equals(tkey.toString()));
        assertTrue(
            "Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*",
            cmpval.toString().equals(tval.toString()));
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
Example 20
Source File: TestMRSequenceFileAsBinaryOutputFormat.java From hadoop with Apache License 2.0 | 4 votes |
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);

  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);

  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat = new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals(
            "Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue(
            "Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}