Python stable_baselines.DDPG Examples

The following are 11 code examples of stable_baselines.DDPG(), collected from open-source projects. The source file, project, and license are noted above each example. You may also want to check out the other available functions and classes of the stable_baselines module.
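Before the project examples, here is a minimal, self-contained sketch of the basic DDPG workflow they all build on. It follows the stable-baselines v2 API; the Pendulum-v0 environment and the hyperparameter values are illustrative choices, not taken from any example below.

import gym
import numpy as np

from stable_baselines import DDPG
from stable_baselines.common.noise import NormalActionNoise

env = gym.make('Pendulum-v0')

# DDPG learns a deterministic policy, so exploration comes from noise added to the actions
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = DDPG('MlpPolicy', env, action_noise=action_noise, verbose=1)
model.learn(total_timesteps=10000)
model.save('ddpg_pendulum')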
Example #1
Source File: test_identity.py    From stable-baselines with MIT License
def test_identity_continuous(model_class):
    """
    Test if the algorithm (with a given policy)
    can learn an identity transformation (i.e. return observation as an action)
    """
    env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

    if model_class in [DDPG, TD3]:
        n_actions = 1
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    else:
        action_noise = None

    model = model_class("MlpPolicy", env, gamma=0.1, seed=0,
                         action_noise=action_noise, buffer_size=int(1e6))
    model.learn(total_timesteps=20000)

    evaluate_policy(model, env, n_eval_episodes=20, reward_threshold=90)
    # Free memory
    del model, env 
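Note the branch on model_class: DDPG and TD3 learn deterministic policies, so exploration has to be injected as action noise, whereas the other algorithms under test explore through their stochastic policies. An ActionNoise instance is simply a callable that returns one noise sample per call; a quick illustration:

import numpy as np
from stable_baselines.common.noise import NormalActionNoise

noise = NormalActionNoise(mean=np.zeros(1), sigma=0.1 * np.ones(1))
sample = noise()  # one Gaussian draw; DDPG adds this to the policy's action during rollouts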
Example #2
Source File: test_0deterministic.py    From stable-baselines with MIT License
def test_deterministic_training_common(algo):
    results = [[], []]
    rewards = [[], []]
    kwargs = {'n_cpu_tf_sess': 1}
    if algo in [DDPG, TD3, SAC]:
        env_id = 'Pendulum-v0'
        kwargs.update({'action_noise': NormalActionNoise(0.0, 0.1)})
    else:
        env_id = 'CartPole-v1'
        if algo == DQN:
            kwargs.update({'learning_starts': 100})

    for i in range(2):
        model = algo('MlpPolicy', env_id, seed=SEED, **kwargs)
        model.learn(N_STEPS_TRAINING)
        env = model.get_env()
        obs = env.reset()
        for _ in range(100):
            action, _ = model.predict(obs, deterministic=False)
            obs, reward, _, _ = env.step(action)
            results[i].append(action)
            rewards[i].append(reward)
    assert sum(results[0]) == sum(results[1]), results
    assert sum(rewards[0]) == sum(rewards[1]), rewards 
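SEED and N_STEPS_TRAINING are module-level constants of the test file that are not part of this excerpt; a plausible reconstruction (the exact values are an assumption):

SEED = 0                 # assumed value
N_STEPS_TRAINING = 3000  # assumed value; long enough for two runs to diverge if seeding were broken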
Example #3
Source File: test_her.py    From stable-baselines with MIT License
def test_long_episode(model_class):
    """
    Check that the model does not break when the replay buffer is still empty
    after the first rollout (because the episode is not over).
    """
    # n_bits > nb_rollout_steps
    n_bits = 10
    env = BitFlippingEnv(n_bits, continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=n_bits)
    kwargs = {}
    if model_class == DDPG:
        kwargs['nb_rollout_steps'] = 9  # < n_bits
    elif model_class in [DQN, SAC, TD3]:
        kwargs['batch_size'] = 8  # < n_bits
        kwargs['learning_starts'] = 0

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy='future',
                verbose=0, **kwargs)
    model.learn(200) 
Example #4
Source File: hyperparams_opt.py    From rl-baselines-zoo with MIT License
def sample_her_params(trial):
    """
    Sampler for HER hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    if trial.model_class == SAC:
        hyperparams = sample_sac_params(trial)
    elif trial.model_class == DDPG:
        hyperparams = sample_ddpg_params(trial)
    elif trial.model_class == TD3:
        hyperparams = sample_td3_params(trial)

    hyperparams['random_exploration'] = trial.suggest_uniform('random_exploration', 0, 1)
    hyperparams['n_sampled_goal'] = trial.suggest_categorical('n_sampled_goal', [1, 2, 4, 6, 8])

    return hyperparams 
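This sampler reads trial.model_class, an attribute that the rl-baselines-zoo tuning harness attaches to the Optuna trial before sampling. A hedged sketch of how such a sampler plugs into an Optuna study (the objective body, the evaluate helper, and the attribute wiring are illustrative assumptions, not code from the zoo):

import optuna
from stable_baselines import SAC

def objective(trial):
    trial.model_class = SAC  # the harness attaches the algorithm under test
    hyperparams = sample_her_params(trial)
    # build a HER model from hyperparams, train briefly, then report a score
    return evaluate(hyperparams)  # hypothetical evaluation helper

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)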
Example #5
Source File: test_her.py    From stable-baselines with MIT License
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=N_BITS, discrete_obs_space=discrete_obs_space)

    # Take random actions 10% of the time
    kwargs = {'random_exploration': 0.1} if model_class in [DDPG, SAC, TD3] else {}
    model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,
                verbose=0, **kwargs)
    model.learn(1000) 
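The goal_selection_strategy argument controls which achieved states HER relabels as goals; stable-baselines v2 accepts 'future', 'final', 'episode', and 'random'. The test above is parametrized over these strategies, roughly equivalent to the following loop (illustrative; the real file uses pytest parametrization and reuses the BitFlippingEnv env from above):

from stable_baselines import HER, SAC

for strategy in ['future', 'final', 'episode', 'random']:
    model = HER('MlpPolicy', env, SAC, n_sampled_goal=4,
                goal_selection_strategy=strategy, verbose=0)
    model.learn(1000)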
Example #6
Source File: hyperparams_opt.py    From rl-baselines-zoo with MIT License
def sample_ddpg_params(trial):
    """
    Sampler for DDPG hyperparams.

    :param trial: (optuna.trial)
    :return: (dict)
    """
    gamma = trial.suggest_categorical('gamma', [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # actor_lr = trial.suggest_loguniform('actor_lr', 1e-5, 1)
    # critic_lr = trial.suggest_loguniform('critic_lr', 1e-5, 1)
    learning_rate = trial.suggest_loguniform('lr', 1e-5, 1)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64, 128, 256])
    buffer_size = trial.suggest_categorical('memory_limit', [int(1e4), int(1e5), int(1e6)])
    noise_type = trial.suggest_categorical('noise_type', ['ornstein-uhlenbeck', 'normal', 'adaptive-param'])
    noise_std = trial.suggest_uniform('noise_std', 0, 1)
    normalize_observations = trial.suggest_categorical('normalize_observations', [True, False])
    normalize_returns = trial.suggest_categorical('normalize_returns', [True, False])

    hyperparams = {
        'gamma': gamma,
        'actor_lr': learning_rate,
        'critic_lr': learning_rate,
        'batch_size': batch_size,
        'memory_limit': buffer_size,
        'normalize_observations': normalize_observations,
        'normalize_returns': normalize_returns
    }

    if noise_type == 'adaptive-param':
        hyperparams['param_noise'] = AdaptiveParamNoiseSpec(initial_stddev=noise_std,
                                                            desired_action_stddev=noise_std)
        # Apply layer normalization when using parameter perturbation
        hyperparams['policy_kwargs'] = dict(layer_norm=True)
    elif noise_type == 'normal':
        hyperparams['action_noise'] = NormalActionNoise(mean=np.zeros(trial.n_actions),
                                                        sigma=noise_std * np.ones(trial.n_actions))
    elif noise_type == 'ornstein-uhlenbeck':
        hyperparams['action_noise'] = OrnsteinUhlenbeckActionNoise(mean=np.zeros(trial.n_actions),
                                                                   sigma=noise_std * np.ones(trial.n_actions))
    return hyperparams 
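The returned keys ('actor_lr', 'critic_lr', 'memory_limit', 'param_noise', 'action_noise', 'policy_kwargs', ...) match the stable-baselines v2 DDPG constructor, so the sampled dictionary can be unpacked directly (a sketch; it assumes the harness has already attached trial.n_actions):

hyperparams = sample_ddpg_params(trial)
model = DDPG('MlpPolicy', env, **hyperparams)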
Example #7
Source File: train_maneuver_DDPG.py    From flappy with MIT License
def main(args):

	start = time.time()

	env_id = 'fwmav_maneuver-v0'
	env = DummyVecEnv([make_env(env_id, 0)])
	# env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

	n_actions = env.action_space.shape[-1]
	param_noise = None
	action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

	model = DDPG(
			policy=MyDDPGPolicy,
			env=env,
			gamma=1.0,
			nb_train_steps=5000,
			nb_rollout_steps=10000,
			nb_eval_steps=10000,
			param_noise=param_noise,
			action_noise=action_noise,
			tau=0.003,
			batch_size=256,
			observation_range=(-np.inf, np.inf),
			actor_lr=0.0001,
			critic_lr=0.001,
			reward_scale=0.05,
			memory_limit=10000000,
			verbose=1,
	)

	model.learn(total_timesteps=args.time_step)
	model.save(args.model_path)

	end = time.time()
	print("Time used: ", end - start) 
Example #8
Source File: train_DDPG.py    From flappy with MIT License
def main(args):

	start = time.time()

	env_id = 'fwmav_hover-v0'
	env = DummyVecEnv([make_env(env_id, 0)])
	# env = SubprocVecEnv([make_env(env_id, i) for i in range(args.n_cpu)])

	n_actions = env.action_space.shape[-1]
	param_noise = None
	action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

	model = DDPG(
			policy=MyDDPGPolicy,
			env=env,
			gamma=1.0,
			nb_train_steps=5000,
			nb_rollout_steps=10000,
			nb_eval_steps=10000,
			param_noise=param_noise,
			action_noise=action_noise,
			tau=0.003,
			batch_size=256,
			observation_range=(-np.inf, np.inf),
			actor_lr=0.0001,
			critic_lr=0.001,
			reward_scale=0.05,
			memory_limit=10000000,
			verbose=1,
	)

	model.learn(total_timesteps=args.time_step)
	model.save(args.model_path)

	end = time.time()
	print("Time used: ", end - start) 
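Both flappy scripts depend on a make_env helper that is not shown in the excerpts. A sketch of the standard stable-baselines pattern it most likely follows (the rank-based seeding is the recipe from the stable-baselines docs, assumed here rather than taken from the project):

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_id, rank, seed=0):
    """Return a thunk that creates and seeds one environment instance."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init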
Example #9
Source File: ddpg.py    From robotics-rl-srl with MIT License
def __init__(self):
        super(DDPGModel, self).__init__(name="ddpg", model_class=DDPG) 
Example #10
Source File: train_ddpg_pushing.py    From pybullet-robot-envs with GNU Lesser General Public License v2.1
def main():
    # create Environment
    env = iCubPushGymEnv(renders=False, use_IK=1, obj_pose_rnd_std=0, max_steps=2000, reward_type=0)

    # set seed
    seed = 1
    tf.reset_default_graph()
    set_global_seed(seed)
    env.seed(seed)

    # set log
    monitor_dir = os.path.join(log_dir, 'log')
    os.makedirs(monitor_dir, exist_ok=True)
    env = Monitor(env, monitor_dir + '/', allow_early_resets=True)

    # create agent model
    nb_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(0.5373) * np.ones(nb_actions))

    model = DDPG('LnMlpPolicy', env, action_noise=action_noise, gamma=0.99, batch_size=16,
                 normalize_observations=True, normalize_returns=False, memory_limit=100000,
                 verbose=1, tensorboard_log=os.path.join(log_dir, 'tb'), full_tensorboard_log=False)

    # start learning
    model.learn(total_timesteps=500000, seed=seed, callback=callback)

    # save model
    print("Saving model.pkl to ",log_dir)
    model.save(log_dir+"/final_model.pkl") 
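The callback passed to model.learn() is defined elsewhere in the training script. In stable-baselines v2, a callback is a plain function that receives the training loop's local and global variables and returns False to stop training. A hedged sketch of a typical best-model-saving callback built on the Monitor log written above (the saving logic is an assumption, not the project's actual callback):

import os
import numpy as np
from stable_baselines.results_plotter import load_results, ts2xy

best_mean_reward, n_calls = -np.inf, 0

def callback(_locals, _globals):
    """Check progress every 1000 calls and keep the best model; return True to continue."""
    global best_mean_reward, n_calls
    n_calls += 1
    if n_calls % 1000 == 0:
        x, y = ts2xy(load_results(monitor_dir), 'timesteps')
        if len(y) > 0:
            mean_reward = np.mean(y[-100:])
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                _locals['self'].save(os.path.join(log_dir, 'best_model.pkl'))
    return True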
Example #11
Source File: test_her.py    From stable-baselines with MIT License
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy', env, model_class, n_sampled_goal=3, goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her.zip')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her.zip', env=VecNormalize(env))

    model = HER.load('./test_her.zip')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS, continuous=model_class in [DDPG, SAC, TD3], max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.zip'):
        os.remove('./test_her.zip')
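model_predict is a helper defined elsewhere in test_her.py; a plausible reconstruction based on how it is called above (an assumption, not the exact source):

def model_predict(model, env, n_steps, additional_check=None):
    """Run the trained model for n_steps, optionally applying an extra check per step."""
    obs = env.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        if additional_check is not None:
            additional_check(obs, action, reward, done)
        if done:  # vectorized envs reset themselves; plain envs need an explicit reset
            obs = env.reset()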