Python baselines.ppo1.mlp_policy.MlpPolicy() Examples
The following are 30 code examples of baselines.ppo1.mlp_policy.MlpPolicy(), drawn from open-source projects. Each example notes its source file, the project it comes from, and that project's license. You may also want to check out all available functions/classes of the module baselines.ppo1.mlp_policy.
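Before the examples, here is a minimal sketch of the pattern they all share: MlpPolicy is not instantiated directly but wrapped in a policy_fn factory that the ppo1/trpo_mpi learners call with a scope name and the environment's observation and action spaces. The environment id and hyperparameter values below are illustrative assumptions, not taken from any single example.

# Minimal sketch of the common MlpPolicy usage pattern.
# The environment id and hyperparameters are illustrative assumptions.
import gym
import baselines.common.tf_util as U
from baselines.ppo1 import mlp_policy, pposgd_simple

def policy_fn(name, ob_space, ac_space):
    # The learner calls this factory to build the policy network(s).
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                hid_size=64, num_hid_layers=2)

def main():
    U.make_session(num_cpu=1).__enter__()   # single-CPU TF session, as in the examples below
    env = gym.make('Pendulum-v0')           # assumed continuous-control environment
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=10000,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()

if __name__ == '__main__':
    main()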
Example #1
Source File: run_robotics.py From ICML2019-TREX with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #2
Source File: run_mujoco.py From sonic_contest with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #3
Source File: run_mujoco.py From self-imitation-learning with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #4
Source File: run_robotics.py From baselines with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #5
Source File: run_robotics.py From ICML2019-TREX with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #6
Source File: run_mujoco.py From deeprl-baselines with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #7
Source File: run_mujoco.py From DRL_DeliveryDuel with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #8
Source File: run_mujoco.py From deeprl-baselines with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #9
Source File: run_mujoco.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #10
Source File: run_mujoco.py From BackpropThroughTheVoidRL with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #11
Source File: run_mujoco.py From BackpropThroughTheVoidRL with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #12
Source File: run_robotics.py From HardRLWithYoutube with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=256, num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #13
Source File: run_mujoco.py From lirpg with MIT License | 6 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    env = make_mujoco_env(env_id, workerseed)
    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10,
                   cg_damping=0.1, max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #14
Source File: run_mujoco.py From lirpg with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #15
Source File: run_energyplus.py From rl-testbed-for-energyplus with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create a new base directory like /tmp/openai-2018-05-21-12-27-22-552435
    log_dir = os.path.join(energyplus_logbase_dir(),
                           datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"))
    if not os.path.exists(log_dir + '/output'):
        os.makedirs(log_dir + '/output')
    os.environ["ENERGYPLUS_LOG"] = log_dir
    model = os.getenv('ENERGYPLUS_MODEL')
    if model is None:
        print('Environment variable ENERGYPLUS_MODEL is not defined')
        os.exit()  # NOTE: os has no exit(); sys.exit() is presumably intended
    weather = os.getenv('ENERGYPLUS_WEATHER')
    if weather is None:
        print('Environment variable ENERGYPLUS_WEATHER is not defined')
        os.exit()  # NOTE: os has no exit(); sys.exit() is presumably intended
    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print('train: init logger with dir={}'.format(log_dir))  # XXX
        logger.configure(log_dir)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    env = make_energyplus_env(env_id, workerseed)

    trpo_mpi.learn(env, policy_fn,
                   max_timesteps=num_timesteps,
                   # timesteps_per_batch=1*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   timesteps_per_batch=16*1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
    env.close()
Example #16
Source File: run_mujoco.py From baselines with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #17
Source File: run_humanoid.py From baselines with MIT License | 5 votes |
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. Check the monitor for unscaled reward.")
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.1, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=1e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='constant',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
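The RewScale wrapper used in this and the other run_humanoid.py examples is defined elsewhere in the source file and is not shown here. As a rough orientation, a minimal reward-scaling gym wrapper consistent with the RewScale(env, 0.1) call above could look like the following sketch (this is an assumed implementation, not the project's own code):

import gym

# Sketch of a reward-scaling wrapper matching the RewScale(env, 0.1) usage above.
class RewScale(gym.RewardWrapper):
    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, r):
        # Scale every reward by the given factor (0.1 in the example above).
        return r * self.scale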
Example #18
Source File: run_mujoco.py From self-imitation-learning with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #19
Source File: run_mujoco.py From sonic_contest with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #20
Source File: run_humanoid.py From ICML2019-TREX with MIT License | 5 votes |
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
Example #21
Source File: run_mujoco.py From ICML2019-TREX with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #22
Source File: run_humanoid.py From ICML2019-TREX with MIT License | 5 votes |
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
Example #23
Source File: run_mujoco.py From DRL_DeliveryDuel with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #24
Source File: run_mujoco.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #25
Source File: run_mujoco.py From HardRLWithYoutube with MIT License | 5 votes |
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
Example #26
Source File: run_humanoid.py From HardRLWithYoutube with MIT License | 5 votes |
def train(num_timesteps, seed, model_path=None):
    env_id = 'Humanoid-v2'
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = make_mujoco_env(env_id, seed)

    # parameters below were the best found in a simple random search
    # these are good enough to make humanoid walk, but whether those are
    # an absolute best or not is not certain
    env = RewScale(env, 0.1)
    pi = pposgd_simple.learn(env, policy_fn,
                             max_timesteps=num_timesteps,
                             timesteps_per_actorbatch=2048,
                             clip_param=0.2, entcoeff=0.0,
                             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                             gamma=0.99, lam=0.95, schedule='linear',
                             )
    env.close()
    if model_path:
        U.save_state(model_path)

    return pi
Example #27
Source File: sim_double_pendulum.py From SenseAct with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Asynchronous Simulation of InvertedDoublePendulum-v2 mujoco environment.
    env = DoubleInvertedPendulumEnv(agent_dt=0.005,
                                    sensor_dt=[0.01, 0.0033333],
                                    is_render=False,
                                    random_state=rand_state)

    # Start environment processes
    env.start()

    # Create baselines ppo policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Plotting process
    pp = Process(target=plot_returns, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(env, policy_fn,
          max_timesteps=1e6,
          timesteps_per_actorbatch=2048,
          clip_param=0.2,
          entcoeff=0.0,
          optim_epochs=10,
          optim_stepsize=0.0001,
          optim_batchsize=64,
          gamma=0.995,
          lam=0.995,
          schedule="linear",
          callback=kindred_callback,
          )

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
Example #28
Source File: create2_mover.py From SenseAct with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #29
Source File: create2_docker.py From SenseAct with BSD 3-Clause "New" or "Revised" License | 4 votes |
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    env = Create2DockerEnv(30, port='/dev/ttyUSB0', ir_window=20, ir_history=1,
                           obs_history=1, dt=0.045, random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({"write_lock": False,
                                     "episodic_returns": [],
                                     "episodic_lengths": [], })
    # Spawn plotting process
    pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env, policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shutdown plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #30
Source File: ppo_baselines_train.py From gymfc with MIT License | 4 votes |
def train(env, num_timesteps, seed, ckpt_dir=None, render=False, ckpt_freq=0,
          restore_dir=None, optim_stepsize=3e-4, schedule="linear", gamma=0.99,
          optim_epochs=10, optim_batchsize=64, horizon=2048):
    from baselines.common.fc_learning_utils import FlightLog
    from mpi4py import MPI
    from baselines import logger
    from baselines.ppo1.mlp_policy import MlpPolicy
    from baselines.common import set_global_seeds
    from baselines.ppo1 import pposgd_simple
    import baselines.common.tf_util as U

    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)
    workerseed = seed + 1000000 * rank

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=32, num_hid_layers=2)

    if render:
        env.render()
    env.seed(workerseed)
    set_global_seeds(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=horizon,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=optim_epochs,
                        optim_stepsize=optim_stepsize,
                        optim_batchsize=optim_batchsize,
                        gamma=0.99, lam=0.95, schedule=schedule,
                        flight_log=None,
                        ckpt_dir=ckpt_dir,
                        restore_dir=restore_dir,
                        save_timestep_period=ckpt_freq)
    env.close()