Python stable_baselines.DDPG Examples
The following are 11 code examples of stable_baselines.DDPG().
You may also want to check out all available functions/classes of the module stable_baselines, or try the search function.
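Before diving into the project-specific examples, here is a minimal, self-contained sketch of the basic DDPG workflow they all build on. It follows the quick-start pattern from the stable-baselines documentation; the environment (Pendulum-v0), noise scale, and timestep budget are illustrative choices, not taken from the examples below.

import gym
import numpy as np

from stable_baselines import DDPG
# In older stable-baselines releases the noise classes live in stable_baselines.ddpg.noise
from stable_baselines.common.noise import OrnsteinUhlenbeckActionNoise

env = gym.make('Pendulum-v0')

# DDPG handles continuous actions and relies on action noise for exploration
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.5 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=50000)
model.save('ddpg_pendulum')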
Example #1
Source File: test_identity.py From stable-baselines with MIT License | 6 votes |
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                        action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env
Example #2
Source File: test_0deterministic.py From stable-baselines with MIT License | 6 votes |
def test_deterministic_training_common(algo):
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    if algo in [DDPG, TD3, SAC]:
        env_id = 'Pendulum-v0'
        kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})
    else:
        env_id = 'CartPole-v1'
        if algo == DQN:
            kwargs.update({'learning_starts': 100})

    for i in range(2):
        model = algo('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(100):
            action, _ = model.predict(obs, deterministic=False)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)

    assert sum(results[0]) == sum(results[1]), results
    assert sum(rewards[0]) == sum(rewards[1]), rewards
Example #3
Source File: test_her.py From stable-baselines with MIT License | 6 votes |
def test_long_episode(model_class):
    """
    Check that the model does not break when the replay buffer is still empty
    after the first rollout (because the episode is not over).
    """
    # n_bits > nb_rollout_steps
    n_bits = 10
    env = BitFlippingEnv(n_bits, continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=n_bits)
    kwargs = {}
    if model_class == DDPG:
        kwargs['nb_rollout_steps'] = 9  # < n_bits
    elif model_class in [DQN, SAC, TD3]:
        kwargs['batch_size'] = 8  # < n_bits
        kwargs['learning_starts'] = 0

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
                goal_selection_strategy='future', verbose=0, **kwargs)
    model.learn(200)
Example #4
Source File: hyperparams_opt.py From rl-baselines-zoo with MIT License | 6 votes |
def sample_her_params(trial):
    """
    Sampler for HER hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    if trial.model_class == SAC:
        hyperparams = sample_sac_params(trial)
    elif trial.model_class == DDPG:
        hyperparams = sample_ddpg_params(trial)
    elif trial.model_class == TD3:
        hyperparams = sample_td3_params(trial)

    hyperparams['random_exploration'] = trial.suggest_uniform('random_exploration', 0, 1)
    hyperparams['n_sampled_goal'] = trial.suggest_categorical('n_sampled_goal', [1, 2, 4, 6, 8])

    return hyperparams
Example #5
Source File: test_her.py From stable-baselines with MIT License | 5 votes |
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=N_BITS, discrete_obs_space=discrete_obs_space)

    # Take random actions 10% of the time
    kwargs = {'random_exploration': 0.1} if model_class in [DDPG, SAC, TD3] else {}
    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=0, **kwargs)
    model.learn(1000)
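Examples #3 and #5 exercise HER with DDPG inside a test harness. Outside of tests, the same combination can be trained, saved, and reloaded directly; a short sketch, assuming the BitFlippingEnv helper shipped with stable-baselines and purely illustrative settings:

from stable_baselines import HER, DDPG
from stable_baselines.common.bit_flipping_env import BitFlippingEnv

N_BITS = 15
# Continuous variant of the bit-flipping goal env, since DDPG needs a Box action space
env = BitFlippingEnv(N_BITS, continuous=True, max_steps=N_BITS)

# HER wraps DDPG and re-samples goals with the 'future' strategy
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4,
            goal_selection_strategy='future', verbose=1)
model.learn(5000)

model.save('./her_ddpg_bit_flipping')
# Loading needs the goal env again so HER can rebuild its wrapper
model = HER.load('./her_ddpg_bit_flipping', env=env)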
Example #6
Source File: hyperparams_opt.py From rl-baselines-zoo with MIT License | 5 votes |
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))

    return hyperparams
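This sampler expects trial.n_actions (and, for the HER sampler above, trial.model_class) to be attached to the trial by the surrounding optimization loop in rl-baselines-zoo. Below is a hedged sketch of how it might be driven from a plain Optuna study; the objective (Pendulum-v0, a short training budget, five evaluation episodes) is an illustration, not the zoo's actual optimization code.

import gym
import optuna

from stable_baselines import DDPG
from stable_baselines.common.evaluation import evaluate_policy


def objective(trial):
    env = gym.make('Pendulum-v0')
    # rl-baselines-zoo attaches this attribute before sampling; we mimic that here
    trial.n_actions = env.action_space.shape[-1]
    hyperparams = sample_ddpg_params(trial)
    model = DDPG('MlpPolicy', env, verbose=0, **hyperparams)
    model.learn(total_timesteps=10000)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
    return mean_reward


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)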
Example #7
Source File: train_maneuver_DDPG.py From flappy with MIT License | 5 votes |
def main(args):
    start = time.time()

    env_id = 'fwmav_maneuver-v0'
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    end = time.time()
    print("Time used: ", end - start)
Example #8
Source File: train_DDPG.py From flappy with MIT License | 5 votes |
def main(args):
    start = time.time()

    env_id = 'fwmav_hover-v0'
    env = DummyVecEnv([make_env(env_id, 0)])
    # env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) * np.ones(n_actions))

    model = DDPG(
        policy=MyDDPGPolicy,
        env=env,
        gamma=1.0,
        nb_train_steps=5000,
        nb_rollout_steps=10000,
        nb_eval_steps=10000,
        param_noise=param_noise,
        action_noise=action_noise,
        tau=0.003,
        batch_size=256,
        observation_range=(-np.inf, np.inf),
        actor_lr=0.0001,
        critic_lr=0.001,
        reward_scale=0.05,
        memory_limit=10000000,
        verbose=1,
    )

    model.learn(total_timesteps=args.time_step)
    model.save(args.model_path)

    end = time.time()
    print("Time used: ", end - start)
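Examples #7 and #8 pass a custom MyDDPGPolicy that is defined elsewhere in the flappy project. For orientation, a custom DDPG policy is typically declared by subclassing FeedForwardPolicy, following the stable-baselines custom-policy documentation; the layer sizes below are an assumption, not the architecture flappy actually uses.

from stable_baselines.ddpg.policies import FeedForwardPolicy


class MyDDPGPolicy(FeedForwardPolicy):
    """Illustrative custom actor/critic network for DDPG (hidden sizes are a guess)."""
    def __init__(self, *args, **kwargs):
        super(MyDDPGPolicy, self).__init__(*args, **kwargs,
                                           layers=[64, 64],
                                           layer_norm=False,
                                           feature_extraction="mlp")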
Example #9
Source File: ddpg.py From robotics-rl-srl with MIT License | 5 votes |
def __init__(self):
    super(DDPGModel, self).__init__(name="ddpg", model_class=DDPG)
Example #10
Source File: train_ddpg_pushing.py From pybullet-robot-envs with GNU Lesser General Public License v2.1 | 5 votes |
def main():
    # create Environment
    env = iCubPushGymEnv(renders=False, use_IK=1, obj_pose_rnd_std=0, max_steps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seed(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99, batch_size=16,
                 normalize_observations=True, normalize_returns=False, memory_limit=100000,
                 verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'),
                 full_tensorboard_log=False)

    # start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving model.pkl to ", log_dir)
    model.save(log_dir + "/final_model.pkl")
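After training, the saved agent can be restored and rolled out without the training setup. Below is a brief sketch that reuses log_dir and the iCubPushGymEnv construction from the example above (both come from that project, not from stable-baselines); the rollout loop itself is generic stable-baselines usage.

from stable_baselines import DDPG

# Restore the agent saved by main() above
model = DDPG.load(log_dir + "/final_model.pkl")

# Re-create the environment, this time with rendering enabled
env = iCubPushGymEnv(renders=True, use_IK=1, obj_pose_rnd_std=0, max_steps=2000, reward_type=0)

obs = env.reset()
for _ in range(2000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()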
Example #11
Source File: test_her.py From stable-baselines with MIT License | 4 votes |
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy, verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her.zip')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her.zip', env=VecNormalize(env))

    model = HER.load('./test_her.zip')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.zip'):
        os.remove('./test_her.zip')