Python torch.autograd.backward() Examples
The following are 2 code examples of torch.autograd.backward(). You can go to the original project or source file via the reference above each example. You may also want to check out the other available functions and classes of the torch.autograd module.
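Before the project examples, here is a minimal sketch of what torch.autograd.backward() does, assuming a reasonably recent PyTorch release: it takes one or more output tensors (plus matching gradient tensors for non-scalar outputs) and accumulates gradients into the .grad fields of the leaf tensors. The tensors x and y below are made up for illustration.

import torch

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()          # scalar output

# Equivalent to y.backward(); for a scalar output the gradient argument
# defaults to a tensor of 1.0.
torch.autograd.backward([y])

print(x.grad)               # tensor([2., 4., 6.])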
Example #1
Source File: actor_critic.py, from the pytorchrl project (MIT License)
def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    value_loss = 0
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for (action, value), r in zip(saved_actions, rewards):
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]
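Note that this example targets a very old PyTorch API (Variable, the removed .reinforce() stochastic-node mechanism, and torch.ones(1) as the gradient for a scalar loss). Purely to illustrate the multi-output backward pattern it relies on, here is a hedged sketch in current PyTorch; the tensors w, value_loss, and policy_out are invented for the sketch and are not part of the project.

import torch

w = torch.randn(3, requires_grad=True)
value_loss = (w ** 2).sum()        # scalar loss
policy_out = w * 2.0               # non-scalar graph output

# One backward pass over several graph outputs: scalar tensors may take None
# as their gradient, non-scalar tensors need an explicit grad tensor.
torch.autograd.backward([value_loss, policy_out],
                        [None, torch.ones_like(policy_out)])

print(w.grad)                      # 2*w (from value_loss) + 2 (from policy_out)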
Example #2
Source File: acer_single_process.py, from the pytorch-rl project (MIT License)
def _off_policy_rollout(self):
    # reset rollout experiences
    self._reset_rollout()
    # first sample trajectories
    trajectories = self.memory.sample_batch(self.master.batch_size, maxlen=self.master.rollout_steps)
    # NOTE: we also store another set of undetached unsplitted policy_vb here to prepare for backward
    unsplitted_policy_vb = []
    # then fake the on-policy forward
    for t in range(len(trajectories) - 1):
        # we first get the data out of the sampled experience
        state0 = np.stack((trajectory.state0 for trajectory in trajectories[t]))
        action = np.expand_dims(np.stack((trajectory.action for trajectory in trajectories[t])), axis=1)
        reward = np.expand_dims(np.stack((trajectory.reward for trajectory in trajectories[t])), axis=1)
        state1 = np.stack((trajectory.state0 for trajectory in trajectories[t+1]))
        terminal1 = np.expand_dims(np.stack((1 if trajectory.action is None else 0 for trajectory in trajectories[t+1])), axis=1)  # NOTE: here is 0/1, in on-policy is False/True
        detached_old_policy_vb = torch.cat([trajectory.detached_old_policy_vb for trajectory in trajectories[t]], 0)
        # NOTE: here first store the last frame: experience.state1 as rollout.state0
        self.rollout.state0.append(state0)
        # then get its corresponding output variables to fake the on policy experience
        if self.master.enable_continuous:
            pass
        else:
            _, p_vb, q_vb, v_vb, avg_p_vb = self._forward(self._preprocessState(self.rollout.state0[-1], on_policy=False), on_policy=False)
        # push experience into rollout
        self.rollout.action.append(action)
        self.rollout.reward.append(reward)
        self.rollout.state1.append(state1)
        self.rollout.terminal1.append(terminal1)
        self.rollout.policy_vb.append(p_vb.split(1, 0))  # NOTE: must split before detach !!! otherwise graph is cut
        self.rollout.q0_vb.append(q_vb)
        self.rollout.value0_vb.append(v_vb)
        self.rollout.detached_avg_policy_vb.append(avg_p_vb.detach())  # NOTE
        self.rollout.detached_old_policy_vb.append(detached_old_policy_vb)
        unsplitted_policy_vb.append(p_vb)
    # also need to log some training stats here maybe
    return unsplitted_policy_vb
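The "must split before detach" note matters for the later backward call: .split() returns tensors that stay attached to the autograd graph, while .detach() cuts the graph, so detaching before splitting would leave nothing for gradients to flow through. A small standalone sketch of the difference (the names p and out are illustrative, not from the project):

import torch

p = torch.randn(4, 2, requires_grad=True)
out = p * 2.0

# Splitting first: each chunk is still attached to the graph, so backward reaches p.
chunk0 = out.split(1, 0)[0]
torch.autograd.backward([chunk0.sum()])
print(p.grad[0])                      # tensor([2., 2.])

# Detaching first: the resulting chunks carry no graph, so gradients cannot flow.
detached_chunk0 = out.detach().split(1, 0)[0]
print(detached_chunk0.requires_grad)  # False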