Python torch.autograd.backward() Examples
The following are 2 code examples of torch.autograd.backward(). You can go to the original project or source file via the reference above each example. You may also want to check out the other available functions and classes of the torch.autograd module.
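Before the project examples, here is a minimal sketch of what torch.autograd.backward() does, assuming a reasonably recent PyTorch release: it takes one or more output tensors (plus matching gradient tensors for non-scalar outputs) and accumulates gradients into the .grad fields of the leaf tensors. The tensors x and y below are made up for illustration.

import torch

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()          # scalar output

# Equivalent to y.backward(); for a scalar output the gradient argument
# defaults to a tensor of 1.0.
torch.autograd.backward([y])

print(x.grad)               # tensor([2., 4., 6.])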
Example #1
Source File: actor_critic.py, from the pytorchrl project (MIT License)
def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    value_loss = 0
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for (action, value), r in zip(saved_actions, rewards):
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]
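Note that this example targets a very old PyTorch API (Variable, the removed .reinforce() stochastic-node mechanism, and torch.ones(1) as the gradient for a scalar loss). Purely to illustrate the multi-output backward pattern it relies on, here is a hedged sketch in current PyTorch; the tensors w, value_loss, and policy_out are invented for the sketch and are not part of the project.

import torch

w = torch.randn(3, requires_grad=True)
value_loss = (w ** 2).sum()        # scalar loss
policy_out = w * 2.0               # non-scalar graph output

# One backward pass over several graph outputs: scalar tensors may take None
# as their gradient, non-scalar tensors need an explicit grad tensor.
torch.autograd.backward([value_loss, policy_out],
                        [None, torch.ones_like(policy_out)])

print(w.grad)                      # 2*w (from value_loss) + 2 (from policy_out)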
Example #2
Source File: acer_single_process.py, from the pytorch-rl project (MIT License)
def _off_policy_rollout(self):
    # reset rollout experiences
    self._reset_rollout()
    # first sample trajectories
    trajectories = self.memory.sample_batch(self.master.batch_size, maxlen=self.master.rollout_steps)
    # NOTE: we also store another set of undetached unsplitted policy_vb here to prepare for backward
    unsplitted_policy_vb = []
    # then fake the on-policy forward
    for t in range(len(trajectories) - 1):
        # we first get the data out of the sampled experience
        state0 = np.stack((trajectory.state0 for trajectory in trajectories[t]))
        action = np.expand_dims(np.stack((trajectory.action for trajectory in trajectories[t])), axis=1)
        reward = np.expand_dims(np.stack((trajectory.reward for trajectory in trajectories[t])), axis=1)
        state1 = np.stack((trajectory.state0 for trajectory in trajectories[t+1]))
        terminal1 = np.expand_dims(np.stack((1 if trajectory.action is None else 0 for trajectory in trajectories[t+1])), axis=1)  # NOTE: here is 0/1, in on-policy is False/True
        detached_old_policy_vb = torch.cat([trajectory.detached_old_policy_vb for trajectory in trajectories[t]], 0)
        # NOTE: here first store the last frame: experience.state1 as rollout.state0
        self.rollout.state0.append(state0)
        # then get its corresponding output variables to fake the on policy experience
        if self.master.enable_continuous:
            pass
        else:
            _, p_vb, q_vb, v_vb, avg_p_vb = self._forward(self._preprocessState(self.rollout.state0[-1], on_policy=False), on_policy=False)
        # push experience into rollout
        self.rollout.action.append(action)
        self.rollout.reward.append(reward)
        self.rollout.state1.append(state1)
        self.rollout.terminal1.append(terminal1)
        self.rollout.policy_vb.append(p_vb.split(1, 0))  # NOTE: must split before detach !!! otherwise graph is cut
        self.rollout.q0_vb.append(q_vb)
        self.rollout.value0_vb.append(v_vb)
        self.rollout.detached_avg_policy_vb.append(avg_p_vb.detach())  # NOTE
        self.rollout.detached_old_policy_vb.append(detached_old_policy_vb)
        unsplitted_policy_vb.append(p_vb)
    # also need to log some training stats here maybe
    return unsplitted_policy_vb
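The "must split before detach" note matters for the later backward call: .split() returns tensors that stay attached to the autograd graph, while .detach() cuts the graph, so detaching before splitting would leave nothing for gradients to flow through. A small standalone sketch of the difference (the names p and out are illustrative, not from the project):

import torch

p = torch.randn(4, 2, requires_grad=True)
out = p * 2.0

# Splitting first: each chunk is still attached to the graph, so backward reaches p.
chunk0 = out.split(1, 0)[0]
torch.autograd.backward([chunk0.sum()])
print(p.grad[0])                      # tensor([2., 2.])

# Detaching first: the resulting chunks carry no graph, so gradients cannot flow.
detached_chunk0 = out.detach().split(1, 0)[0]
print(detached_chunk0.requires_grad)  # False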