Python baselines.common.discount() Examples

The following is 1 code example of baselines.common.discount(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module baselines.common, or try the search function.

Example #1

Source File: a2c_cont.py From BackpropThroughTheVoidRL with MIT License

4 votes

def run(self, update_counters=True):
    """Roll out one episode and derive value targets and advantages from it.

    The environment is reset and stepped for at most ``self.max_pathlength``
    steps, stopping early when the environment reports ``done``. The input
    fed to the policy at each step is the current observation concatenated
    with the previous one along the last axis (zeros stand in for the
    previous observation on the first step). The policy output is mapped
    affinely so that -1 lands on ``action_space.low`` and +1 on
    ``action_space.high``, then clipped to those bounds before being sent
    to the environment.

    Side effects: the episode's summed reward is appended to
    ``self.rewards`` (only the last 100 entries are kept); when
    ``update_counters`` is true, ``self._num_rollouts`` and
    ``self._num_steps`` are advanced; and the first time the mean of
    ``self.rewards`` reaches ``self.score``, the counters are snapshotted
    into ``self.episodes_till_done`` / ``self.frames_till_done`` and
    ``self.finished`` is set.

    Args:
        update_counters: Whether this rollout should count towards the
            rollout and step counters.

    Returns:
        A tuple ``(path, vtarg, value, adv_GAE)`` where ``path`` is a dict
        with keys "observation", "terminated", "reward", "action",
        "action_dist" and "logp"; ``value`` is whatever
        ``self.policy.predict`` returns for the path; ``vtarg`` is
        ``common.discount`` applied with ``self.gamma`` to the rewards plus
        a bootstrap entry (last entry dropped); and ``adv_GAE`` is
        ``common.discount`` applied with ``self.gamma * self.lam`` to the
        one-step TD residuals.
        NOTE(review): presumably ``common.discount`` is a discounted
        cumulative sum, making these discounted returns and GAE
        advantages -- confirm against the helper.
    """
    ob = self.env.reset()
    # The zero "previous observation" is shaped from the raw reset
    # observation, i.e. before any observation filter is applied.
    prev_ob = np.float32(np.zeros(ob.shape))
    if self.obfilter:
        ob = self.obfilter(ob)

    obs, acs, ac_dists, logps, rewards = [], [], [], [], []

    for _step in range(self.max_pathlength):
        if self.animate:
            self.env.render()

        state = np.concatenate([ob, prev_ob], -1)
        obs.append(state)

        ac, ac_dist, logp = self.policy.act(state)
        acs.append(ac)
        ac_dists.append(ac_dist)
        logps.append(logp)

        prev_ob = np.copy(ob)

        # Affine map sending -1 -> low and +1 -> high, then clip to bounds.
        space = self.env.action_space
        scaled_ac = space.low + (ac + 1.) * 0.5 * (space.high - space.low)
        scaled_ac = np.clip(scaled_ac, space.low, space.high)

        ob, rew, done, _info = self.env.step(scaled_ac)
        if self.obfilter:
            ob = self.obfilter(ob)
        rewards.append(rew)

        if done:
            terminated = True
            break
    else:
        # Loop ran out of steps without the environment signalling done.
        terminated = False

    self.rewards.append(sum(rewards))
    self.rewards = self.rewards[-100:]
    if update_counters:
        self._num_rollouts += 1
        self._num_steps += len(rewards)

    path = {
        "observation": np.array(obs),
        "terminated": terminated,
        "reward": np.array(rewards),
        "action": np.array(acs),
        "action_dist": np.array(ac_dists),
        "logp": np.array(logps),
    }

    rew_t = path["reward"]
    value = self.policy.predict(path["observation"], path)

    # Bootstrap with 0 when the episode terminated; otherwise reuse the last
    # predicted value as the estimate for the state after the final step.
    bootstrap = 0.0 if path["terminated"] else value[-1]

    vtarg = common.discount(np.append(rew_t, bootstrap), self.gamma)[:-1]
    vpred_t = np.append(value, bootstrap)
    td_residual = rew_t + self.gamma * vpred_t[1:] - vpred_t[:-1]
    adv_GAE = common.discount(td_residual, self.gamma * self.lam)

    reached_score = np.mean(self.rewards) >= self.score
    if reached_score and not self.finished:
        # Snapshot the counters the first time the score threshold is met.
        self.episodes_till_done = self._num_rollouts
        self.frames_till_done = self._num_steps
        self.finished = True

    return path, vtarg, value, adv_GAE