org.apache.tez.mapreduce.protos.MRRuntimeProtos Java Examples

The following examples show how to use org.apache.tez.mapreduce.protos.MRRuntimeProtos. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: MRInputHelpers.java    From tez with Apache License 2.0 6 votes vote down vote up
/**
 * Create an instance of {@link org.apache.hadoop.mapred.InputSplit} from the {@link
 * org.apache.tez.mapreduce.input.MRInput} representation of a split.
 *
 * @param splitProto           The {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}
 *                             instance representing the split
 * @param serializationFactory the serialization mechanism used to write out the split
 * @return an instance of the split
 * @throws java.io.IOException if the split class cannot be loaded or deserialization fails
 */
@SuppressWarnings("unchecked")
@InterfaceStability.Evolving
@InterfaceAudience.LimitedPrivate({"hive", "pig"})
public static InputSplit createOldFormatSplitFromUserPayload(
    MRRuntimeProtos.MRSplitProto splitProto, SerializationFactory serializationFactory)
    throws IOException {
  // This may not need to use serialization factory, since OldFormat
  // always uses Writable to write splits.
  Objects.requireNonNull(splitProto, "splitProto cannot be null");
  String className = splitProto.getSplitClassName();
  Class<InputSplit> clazz;

  try {
    clazz = (Class<InputSplit>) Class.forName(className);
  } catch (ClassNotFoundException e) {
    throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
  }

  Deserializer<InputSplit> deserializer = serializationFactory
      .getDeserializer(clazz);
  deserializer.open(splitProto.getSplitBytes().newInput());
  try {
    // Deserializer is not Closeable, so try-with-resources cannot be used;
    // the finally block guarantees close() even if deserialize() throws.
    return deserializer.deserialize(null);
  } finally {
    deserializer.close();
  }
}
 
Example #2
Source File: MRInputHelpers.java    From tez with Apache License 2.0 6 votes vote down vote up
/**
 * Create an instance of {@link org.apache.hadoop.mapreduce.InputSplit} from the {@link
 * org.apache.tez.mapreduce.input.MRInput} representation of a split.
 *
 * @param splitProto           The {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto}
 *                             instance representing the split
 * @param serializationFactory the serialization mechanism used to write out the split
 * @return an instance of the split
 * @throws IOException if the split class cannot be loaded or deserialization fails
 */
@InterfaceStability.Evolving
@SuppressWarnings("unchecked")
public static org.apache.hadoop.mapreduce.InputSplit createNewFormatSplitFromUserPayload(
    MRRuntimeProtos.MRSplitProto splitProto, SerializationFactory serializationFactory)
    throws IOException {
  Objects.requireNonNull(splitProto, "splitProto must be specified");
  String className = splitProto.getSplitClassName();
  Class<org.apache.hadoop.mapreduce.InputSplit> clazz;

  try {
    clazz = (Class<org.apache.hadoop.mapreduce.InputSplit>) Class
        .forName(className);
  } catch (ClassNotFoundException e) {
    throw new IOException("Failed to load InputSplit class: [" + className + "]", e);
  }

  Deserializer<org.apache.hadoop.mapreduce.InputSplit> deserializer = serializationFactory
      .getDeserializer(clazz);
  deserializer.open(splitProto.getSplitBytes().newInput());
  try {
    // Deserializer is not Closeable, so try-with-resources cannot be used;
    // the finally block guarantees close() even if deserialize() throws.
    return deserializer.deserialize(null);
  } finally {
    deserializer.close();
  }
}
 
Example #3
Source File: MRInputHelpers.java    From tez with Apache License 2.0 6 votes vote down vote up
/**
 * Serialize a new-API {@link org.apache.hadoop.mapreduce.InputSplit} into its
 * {@link MRRuntimeProtos.MRSplitProto} representation, recording the split
 * class name and the serialized split bytes.
 *
 * @param newSplit             the split instance to serialize
 * @param serializationFactory the serialization mechanism used to write out the split
 * @return the proto representation of the split
 * @throws IOException          if serialization fails
 * @throws InterruptedException declared for compatibility with split APIs
 */
@InterfaceStability.Evolving
public static <T extends org.apache.hadoop.mapreduce.InputSplit> MRRuntimeProtos.MRSplitProto createSplitProto(
    T newSplit, SerializationFactory serializationFactory)
    throws IOException, InterruptedException {
  MRRuntimeProtos.MRSplitProto.Builder builder = MRRuntimeProtos.MRSplitProto
      .newBuilder();

  builder.setSplitClassName(newSplit.getClass().getName());

  @SuppressWarnings("unchecked")
  Serializer<T> serializer = serializationFactory
      .getSerializer((Class<T>) newSplit.getClass());
  ByteString.Output out = ByteString
      .newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
  serializer.open(out);
  try {
    serializer.serialize(newSplit);
  } finally {
    // Close (and flush) the serializer even if serialization fails; the
    // original code leaked it. Closing a ByteString.Output is a no-op, so
    // toByteString() below remains valid.
    serializer.close();
  }
  // TODO MR Compat: Check against max block locations per split.
  ByteString splitBs = out.toByteString();
  builder.setSplitBytes(splitBs);

  return builder.build();
}
 
Example #4
Source File: MRInputHelpers.java    From tez with Apache License 2.0 6 votes vote down vote up
@InterfaceStability.Evolving
@InterfaceAudience.LimitedPrivate({"hive, pig"})
public static MRRuntimeProtos.MRSplitProto createSplitProto(
    org.apache.hadoop.mapred.InputSplit oldSplit) throws IOException {
  MRRuntimeProtos.MRSplitProto.Builder builder = MRRuntimeProtos.MRSplitProto.newBuilder();

  builder.setSplitClassName(oldSplit.getClass().getName());

  ByteString.Output os = ByteString
      .newOutput(SPLIT_SERIALIZED_LENGTH_ESTIMATE);
  oldSplit.write(new NonSyncDataOutputStream(os));
  ByteString splitBs = os.toByteString();
  builder.setSplitBytes(splitBs);

  return builder.build();
}
 
Example #5
Source File: MRInputBase.java    From incubator-tez with Apache License 2.0 5 votes vote down vote up
/**
 * Initializes this input: rebuilds a JobConf from the serialized user payload,
 * attaches the current user's credentials, and records the MR task attempt id
 * and application attempt in the conf for use by the RecordReader/InputFormat.
 *
 * @return null — no events are produced during initialization
 * @throws IOException if the user payload or configuration bytes cannot be parsed
 */
public List<Event> initialize() throws IOException {
  getContext().requestInitialMemory(0L, null); // mandatory call
  MRRuntimeProtos.MRInputUserPayloadProto mrUserPayload =
      MRHelpers.parseMRInputPayload(getContext().getUserPayload());
  // Splits arrive via events for this input; the payload must not carry them.
  Preconditions.checkArgument(!mrUserPayload.hasSplits(),
      "Split information not expected in " + this.getClass().getName());
  Configuration conf = MRHelpers.createConfFromByteString(mrUserPayload.getConfigurationBytes());

  this.jobConf = new JobConf(conf);
  // Add tokens to the jobConf - in case they are accessed within the RR / IF
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());

  // Synthesize an MR-style TaskAttemptID from the Tez task coordinates so
  // MR-era code reading the conf sees a familiar id.
  TaskAttemptID taskAttemptId = new TaskAttemptID(
      new TaskID(
          Long.toString(getContext().getApplicationId().getClusterTimestamp()),
          getContext().getApplicationId().getId(), TaskType.MAP,
          getContext().getTaskIndex()),
      getContext().getTaskAttemptNumber());

  jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
      taskAttemptId.toString());
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      getContext().getDAGAttemptNumber());

  this.inputRecordCounter = getContext().getCounters().findCounter(
      TaskCounter.INPUT_RECORDS_PROCESSED);

  useNewApi = this.jobConf.getUseNewMapper();
  return null;
}
 
Example #6
Source File: MRInputHelpers.java    From tez with Apache License 2.0 5 votes vote down vote up
/**
 * When isGrouped is true, it specifies that grouping of input splits be
 * performed by Tez The conf should have the input format class configuration
 * set to the TezGroupedSplitsInputFormat. The real input format class name
 * should be passed as an argument to this method.
 * <p/>
 * With grouping enabled, the eventual configuration used by the tasks, will have
 * the user-specified InputFormat replaced by either {@link org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat}
 * or {@link org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat}
 */
@InterfaceAudience.Private
protected static UserPayload createMRInputPayload(Configuration conf,
    MRRuntimeProtos.MRSplitsProto mrSplitsProto, boolean isGrouped,
    boolean isSorted) throws
        IOException {
  Preconditions
      .checkArgument(conf != null, "Configuration must be specified");

  return createMRInputPayload(TezUtils.createByteStringFromConf(conf),
      mrSplitsProto, isGrouped, isSorted);
}
 
Example #7
Source File: MRInputHelpers.java    From tez with Apache License 2.0 5 votes vote down vote up
/**
 * Assembles the MRInputUserPayloadProto (configuration bytes, optional splits,
 * grouping and sort flags) and wraps its serialized form in a UserPayload.
 */
private static UserPayload createMRInputPayload(ByteString bytes,
  MRRuntimeProtos.MRSplitsProto mrSplitsProto,
  boolean isGrouped, boolean isSorted) throws IOException {
  MRRuntimeProtos.MRInputUserPayloadProto.Builder payloadBuilder =
      MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
          .setConfigurationBytes(bytes)
          .setGroupingEnabled(isGrouped)
          .setSortSplitsEnabled(isSorted);
  // Splits are optional; only set the field when they were provided.
  if (mrSplitsProto != null) {
    payloadBuilder.setSplits(mrSplitsProto);
  }
  ByteString serialized = payloadBuilder.build().toByteString();
  return UserPayload.create(serialized.asReadOnlyByteBuffer());
}
 
Example #8
Source File: MRInputHelpers.java    From tez with Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload: builds the MRInput payload with grouping disabled and
 * split sorting enabled.
 */
@InterfaceAudience.Private
protected static UserPayload createMRInputPayload(Configuration conf,
                                                  MRRuntimeProtos.MRSplitsProto mrSplitsProto) throws
    IOException {
  // Defaults: no split grouping, splits sorted.
  final boolean grouped = false;
  final boolean sorted = true;
  return createMRInputPayload(conf, mrSplitsProto, grouped, sorted);
}
 
Example #9
Source File: YARNRunner.java    From tez with Apache License 2.0 4 votes vote down vote up
protected static UserPayload createMRInputPayload(Configuration conf,
    MRRuntimeProtos.MRSplitsProto mrSplitsProto) throws
        IOException {
  return MRInputHelpers.createMRInputPayload(conf, mrSplitsProto, false,
      true);
}
 
Example #10
Source File: MRInputBase.java    From tez with Apache License 2.0 4 votes vote down vote up
public List<Event> initialize() throws IOException {
  getContext().requestInitialMemory(0l, null); // mandatory call
  MRRuntimeProtos.MRInputUserPayloadProto mrUserPayload =
      MRInputHelpers.parseMRInputPayload(getContext().getUserPayload());
  boolean isGrouped = mrUserPayload.getGroupingEnabled();
  Preconditions.checkArgument(mrUserPayload.hasSplits() == false,
      "Split information not expected in " + this.getClass().getName());

  Configuration conf = new JobConf(getContext().getContainerConfiguration());
  TezUtils.addToConfFromByteString(conf, mrUserPayload.getConfigurationBytes());
  this.jobConf = new JobConf(conf);
  useNewApi = this.jobConf.getUseNewMapper();
  if (isGrouped) {
    if (useNewApi) {
      jobConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR,
          org.apache.hadoop.mapreduce.split.TezGroupedSplitsInputFormat.class.getName());
    } else {
      jobConf.set("mapred.input.format.class",
          org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat.class.getName());
    }
  }


  // Add tokens to the jobConf - in case they are accessed within the RR / IF
  jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());

  TaskAttemptID taskAttemptId = new TaskAttemptID(
      new TaskID(
          Long.toString(getContext().getApplicationId().getClusterTimestamp()),
          getContext().getApplicationId().getId(), TaskType.MAP,
          getContext().getTaskIndex()),
      getContext().getTaskAttemptNumber());

  jobConf.set(MRJobConfig.TASK_ATTEMPT_ID,
      taskAttemptId.toString());
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID,
      getContext().getDAGAttemptNumber());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_DAG_INDEX, getContext().getDagIdentifier());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_VERTEX_INDEX, getContext().getTaskVertexIndex());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_TASK_INDEX, getContext().getTaskIndex());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_TASK_ATTEMPT_INDEX, getContext().getTaskAttemptNumber());
  jobConf.set(MRInput.TEZ_MAPREDUCE_DAG_NAME, getContext().getDAGName());
  jobConf.set(MRInput.TEZ_MAPREDUCE_VERTEX_NAME, getContext().getTaskVertexName());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_INPUT_INDEX, getContext().getInputIndex());
  jobConf.set(MRInput.TEZ_MAPREDUCE_INPUT_NAME, getContext().getSourceVertexName());
  jobConf.set(MRInput.TEZ_MAPREDUCE_APPLICATION_ID, getContext().getApplicationId().toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_UNIQUE_IDENTIFIER, getContext().getUniqueIdentifier());
  jobConf.setInt(MRInput.TEZ_MAPREDUCE_DAG_ATTEMPT_NUMBER, getContext().getDAGAttemptNumber());

  TezDAGID tezDAGID = TezDAGID.getInstance(getContext().getApplicationId(), getContext().getDagIdentifier());
  TezVertexID tezVertexID = TezVertexID.getInstance(tezDAGID, getContext().getTaskVertexIndex());
  TezTaskID tezTaskID = TezTaskID.getInstance(tezVertexID, getContext().getTaskIndex());
  TezTaskAttemptID tezTaskAttemptID = TezTaskAttemptID.getInstance(tezTaskID, getContext().getTaskAttemptNumber());
  jobConf.set(MRInput.TEZ_MAPREDUCE_DAG_ID, tezDAGID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_VERTEX_ID, tezVertexID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_TASK_ID, tezTaskID.toString());
  jobConf.set(MRInput.TEZ_MAPREDUCE_TASK_ATTEMPT_ID, tezTaskAttemptID.toString());

  this.inputRecordCounter = getContext().getCounters().findCounter(
      TaskCounter.INPUT_RECORDS_PROCESSED);


  return null;
}
 
Example #11
Source File: MRInput.java    From tez with Apache License 2.0 4 votes vote down vote up
protected static UserPayload createMRInputPayload(Configuration conf,
    MRRuntimeProtos.MRSplitsProto mrSplitsProto) throws
    IOException {
  return MRInputHelpers.createMRInputPayload(conf, mrSplitsProto, false,
      true);
}
 
Example #12
Source File: TestMapProcessor.java    From tez with Apache License 2.0 4 votes vote down vote up
@Test(timeout = 5000)
  public void testMapProcessor() throws Exception {
    String dagName = "mrdag0";
    String vertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);

    MRHelpers.translateMRConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir,
        "localized-resources").toUri().toString());
    
    Path mapInput = new Path(workDir, "map0");
    
    
    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput, 10);

    InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
        InputDescriptor.create(MRInputLegacy.class.getName())
            .setUserPayload(UserPayload.create(ByteBuffer.wrap(
                MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                    .setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf)).build()
                    .toByteArray()))),
        1);
    OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex", 
        OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);

    TezSharedExecutor sharedExecutor = new TezSharedExecutor(jobConf);
    LogicalIOProcessorRuntimeTask task = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0,
        new Path(workDir, "map0"), new TestUmbilical(), dagName, vertexName,
        Collections.singletonList(mapInputSpec), Collections.singletonList(mapOutputSpec),
        sharedExecutor);

    task.initialize();
    task.run();
    task.close();
    sharedExecutor.shutdownNow();

    OutputContext outputContext = task.getOutputContexts().iterator().next();
    TezTaskOutput mapOutputs = new TezTaskOutputFiles(
        jobConf, outputContext.getUniqueIdentifier(),
        outputContext.getDagIdentifier());
    
    
    // TODO NEWTEZ FIXME OutputCommitter verification
//    MRTask mrTask = (MRTask)t.getProcessor();
//    Assert.assertEquals(TezNullOutputCommitter.class.getName(), mrTask
//        .getCommitter().getClass().getName());
//    t.close();

    Path mapOutputFile = getMapOutputFile(jobConf, outputContext);
    LOG.info("mapOutputFile = " + mapOutputFile);
    IFile.Reader reader =
        new IFile.Reader(localFs, mapOutputFile, null, null, null, false, 0, -1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    DataInputBuffer keyBuf = new DataInputBuffer();
    DataInputBuffer valueBuf = new DataInputBuffer();
    long prev = Long.MIN_VALUE;
    while (reader.nextRawKey(keyBuf)) {
      reader.nextRawValue(valueBuf);
      key.readFields(keyBuf);
      value.readFields(valueBuf);
      if (prev != Long.MIN_VALUE) {
        assert(prev <= key.get());
        prev = key.get();
      }
      LOG.info("key = " + key.get() + "; value = " + value);
    }
    reader.close();
  }
 
Example #13
Source File: TestMapProcessor.java    From tez with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that intermediate progress updates are reported while the map
 * processor runs: a scheduled monitor samples {@code task.getProgress()} at a
 * 1ms cadence, and the test asserts that at least one value strictly between
 * 0 and 1 was observed.
 */
@Test(timeout = 30000)
public void testMapProcessorProgress() throws Exception {
  String dagName = "mrdag0";
  String vertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
  JobConf jobConf = new JobConf(defaultConf);
  setUpJobConf(jobConf);

  MRHelpers.translateMRConfToTez(jobConf);
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

  jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

  jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR, new Path(workDir,
      "localized-resources").toUri().toString());

  Path mapInput = new Path(workDir, "map0");

  // A large split (100000 records) so the task runs long enough for the
  // monitor to catch an intermediate progress value.
  MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput, 100000);

  InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
      InputDescriptor.create(MRInputLegacy.class.getName())
          .setUserPayload(UserPayload.create(ByteBuffer.wrap(
              MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                  .setConfigurationBytes(TezUtils.createByteStringFromConf
                      (jobConf)).build()
                  .toByteArray()))),
      1);
  OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
      OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
          .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)), 1);

  TezSharedExecutor sharedExecutor = new TezSharedExecutor(jobConf);
  final LogicalIOProcessorRuntimeTask task = MapUtils.createLogicalTask
      (localFs, workDir, jobConf, 0,
          new Path(workDir, "map0"), new TestUmbilical(), dagName, vertexName,
          Collections.singletonList(mapInputSpec),
          Collections.singletonList(mapOutputSpec), sharedExecutor);

  ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1);
  Thread monitorProgress = new Thread(new Runnable() {
    @Override
    public void run() {
      float prog = task.getProgress();
      if (prog > 0.0f && prog < 1.0f) {
        progressUpdate = prog;
      }
    }
  });

  task.initialize();
  scheduler.scheduleAtFixedRate(monitorProgress, 0, 1,
      TimeUnit.MILLISECONDS);
  try {
    task.run();
  } finally {
    // BUG FIX: the scheduler was never shut down, leaking its thread (and
    // keeping the monitor firing) for the remainder of the JVM.
    scheduler.shutdownNow();
  }
  Assert.assertTrue("Progress Updates should be captured!",
      progressUpdate > 0.0f && progressUpdate < 1.0f);
  task.close();
  sharedExecutor.shutdownNow();
}
 
Example #14
Source File: TestMRInput.java    From tez with Apache License 2.0 4 votes vote down vote up
@Test(timeout = 5000)
public void testAttributesInJobConf() throws Exception {
  InputContext inputContext = mock(InputContext.class);
  doReturn(TEST_ATTRIBUTES_DAG_INDEX).when(inputContext).getDagIdentifier();
  doReturn(TEST_ATTRIBUTES_VERTEX_INDEX).when(inputContext).getTaskVertexIndex();
  doReturn(TEST_ATTRIBUTES_TASK_INDEX).when(inputContext).getTaskIndex();
  doReturn(TEST_ATTRIBUTES_TASK_ATTEMPT_INDEX).when(inputContext).getTaskAttemptNumber();
  doReturn(TEST_ATTRIBUTES_INPUT_INDEX).when(inputContext).getInputIndex();
  doReturn(TEST_ATTRIBUTES_DAG_ATTEMPT_NUMBER).when(inputContext).getDAGAttemptNumber();
  doReturn(TEST_ATTRIBUTES_DAG_NAME).when(inputContext).getDAGName();
  doReturn(TEST_ATTRIBUTES_VERTEX_NAME).when(inputContext).getTaskVertexName();
  doReturn(TEST_ATTRIBUTES_INPUT_NAME).when(inputContext).getSourceVertexName();
  doReturn(TEST_ATTRIBUTES_APPLICATION_ID).when(inputContext).getApplicationId();
  doReturn(TEST_ATTRIBUTES_UNIQUE_IDENTIFIER).when(inputContext).getUniqueIdentifier();
  doReturn(new Configuration(false)).when(inputContext).getContainerConfiguration();


  DataSourceDescriptor dsd = MRInput.createConfigBuilder(new Configuration(false),
      TestInputFormat.class).groupSplits(false).build();

  doReturn(dsd.getInputDescriptor().getUserPayload()).when(inputContext).getUserPayload();
  doReturn(new TezCounters()).when(inputContext).getCounters();


  MRInput mrInput = new MRInput(inputContext, 1);
  mrInput.initialize();

  MRRuntimeProtos.MRSplitProto splitProto =
      MRRuntimeProtos.MRSplitProto.newBuilder().setSplitClassName(TestInputSplit.class.getName())
          .build();
  InputDataInformationEvent diEvent = InputDataInformationEvent
      .createWithSerializedPayload(0, splitProto.toByteString().asReadOnlyByteBuffer());

  List<Event> events = new LinkedList<>();
  events.add(diEvent);
  mrInput.handleEvents(events);
  TezCounter counter = mrInput.getContext().getCounters()
      .findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES);
  assertEquals(counter.getValue(), TestInputSplit.length);
  assertTrue(TestInputFormat.invoked.get());
}
 
Example #15
Source File: MRInputHelpers.java    From tez with Apache License 2.0 3 votes vote down vote up
/**
 * Parse the payload used by MRInputPayload
 *
 * @param payload the {@link org.apache.tez.dag.api.UserPayload} instance
 * @return an instance of {@link org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRInputUserPayloadProto},
 * which provides access to the underlying configuration bytes
 * @throws IOException
 */
@InterfaceStability.Evolving
@InterfaceAudience.LimitedPrivate({"hive, pig"})
public static MRRuntimeProtos.MRInputUserPayloadProto parseMRInputPayload(UserPayload payload)
    throws IOException {
  return MRRuntimeProtos.MRInputUserPayloadProto.parseFrom(ByteString.copyFrom(payload.getPayload()));
}