Python baselines.common.tf_util.function() Examples
The following are 30 code examples of baselines.common.tf_util.function().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module baselines.common.tf_util, or try the search function.
Example #1
Source File: acktr_cont.py From DRL_DeliveryDuel with MIT License | 5 votes |
def pathlength(path):
    """Return the number of timesteps stored in a rollout ``path``.

    The length is read from the leading dimension of ``path["reward"]``.
    """
    rewards = path["reward"]
    return rewards.shape[0]
Example #2
Source File: mpi_running_mean_std.py From DRL_DeliveryDuel with MIT License | 5 votes |
def __init__(self, epsilon=1e-2, shape=()):
    """Create the accumulator variables, derived statistics and update function.

    Args:
        epsilon: initial value of the sum-of-squares and count accumulators.
        shape: shape of the sum and sum-of-squares accumulators.
    """
    # Accumulators are float64 and excluded from the trainable variables.
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum",
        trainable=False,
    )
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False,
    )
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count",
        trainable=False,
    )
    self.shape = shape

    # Statistics derived from the accumulators; the variance estimate is
    # floored at 1e-2 before the square root is taken.
    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders for the increments added to each accumulator.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, sum_ph),
        tf.assign_add(self._sumsq, sumsq_ph),
        tf.assign_add(self._count, count_ph),
    ]
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [], updates=increments)
Example #3
Source File: adversary.py From ICML2019-TREX with MIT License | 5 votes |
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
    """Build the discriminator graph, its loss terms and the policy reward op.

    Args:
        env: environment whose ``observation_space`` and ``action_space``
            shapes are recorded on the instance.
        hidden_size: stored on ``self.hidden_size``.
        entcoeff: coefficient applied to the entropy term of the total loss.
        lr_rate: accepted by the signature but not read in this constructor.
        scope: stored on ``self.scope``.
    """
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple(
        o + a for o, a in zip(self.observation_shape, self.actions_shape)
    )
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    # NOTE(review): build_ph() presumably creates the *_ph placeholders used
    # below -- confirm against its definition.
    self.build_ph()

    # Build graph: the second call is made with reuse=True.
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)

    # Accuracy: generator samples should score below 0.5, expert samples above.
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))

    # Regression loss. With x = logits and z = targets:
    #   z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    # Generator samples use targets of 0, expert samples use targets of 1.
    generator_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits)
        )
    )
    expert_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits)
        )
    )

    # Entropy loss over the concatenated logits.
    all_logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(all_logits))
    entropy_loss = -entcoeff * entropy

    # Loss + accuracy terms, with names in matching order.
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss

    # Reward for the policy; 1e-8 keeps the log argument away from zero.
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)

    var_list = self.get_trainable_variables()
    loss_inputs = [
        self.generator_obs_ph,
        self.generator_acs_ph,
        self.expert_obs_ph,
        self.expert_acs_ph,
    ]
    self.lossandgrad = U.function(loss_inputs, self.losses + [U.flatgrad(self.total_loss, var_list)])
Example #4
Source File: distributions.py From DRL_DeliveryDuel with MIT License | 5 votes |
def validate_probtype(probtype, pdparam):
    """Monte-Carlo consistency check of a distribution type.

    Two identities are checked with ``N`` samples, each to within three
    standard errors:
      1. mean negative log-likelihood == entropy
      2. kl[p, q] == -ent[p] - E_p[log q]

    Args:
        probtype: distribution type exposing ``param_placeholder``,
            ``sample_placeholder`` and ``pdfromflat``.
        pdparam: 1-D numpy array of distribution parameters.

    Raises:
        AssertionError: if either identity is violated beyond three sigmas.
    """
    N = 100000

    # --- Check 1: mean negative log likelihood == differential entropy ---
    params_p = np.repeat(pdparam[None, :], N, axis=0)
    params_ph = probtype.param_placeholder([N])
    sample_ph = probtype.sample_placeholder([N])
    dist_p = probtype.pdfromflat(params_ph)
    calcloglik = U.function([sample_ph, params_ph], dist_p.logp(sample_ph))
    calcent = U.function([params_ph], dist_p.entropy())
    samples = tf.get_default_session().run(dist_p.sample(), feed_dict={params_ph: params_p})
    logliks = calcloglik(samples, params_p)
    entval_ll = -logliks.mean()
    entval_ll_stderr = logliks.std() / np.sqrt(N)
    entval = calcent(params_p).mean()
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # --- Check 2: kldiv[p,q] = - ent[p] - E_p[log q] ---
    params_ph2 = probtype.param_placeholder([N])
    dist_q = probtype.pdfromflat(params_ph2)
    # q is p with a small random perturbation of its parameters.
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    params_q = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([params_ph, params_ph2], dist_p.kl(dist_q))
    klval = calckl(params_p, params_q).mean()
    logliks = calcloglik(samples, params_q)
    klval_ll = -entval - logliks.mean()
    klval_ll_stderr = logliks.std() / np.sqrt(N)
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
Example #5
Source File: mpi_adam.py From DRL_DeliveryDuel with MIT License | 5 votes |
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TF's Adam.

    The reference run applies ``tf.train.AdamOptimizer`` for ten steps. The
    variables are then re-initialised and the same ten steps are taken with
    ``MpiAdam`` driven by the flat gradient. The two loss sequences must
    agree to ``atol=1e-4``.

    Raises:
        AssertionError: if the two loss trajectories differ.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference trajectory: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(10):
        ref_loss = do_update()
        print(i, ref_loss)
        losslist_ref.append(ref_loss)

    # Reset the variables and repeat with MpiAdam.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No `updates=[update_op]` here: running the TF Adam step inside
    # lossandgrad would move the variables a second time per iteration on
    # top of adam.update(), so the trajectories could never match.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for i in range(10):
        test_loss, grad = lossandgrad()
        adam.update(grad, stepsize)
        print(i, test_loss)
        losslist_test.append(test_loss)

    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
Example #6
Source File: mpi_adam.py From ICML2019-TREX with MIT License | 5 votes |
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TF's Adam.

    Ten steps of ``tf.train.AdamOptimizer`` give the reference losses. After
    re-initialising the variables, ten steps of ``MpiAdam`` driven by the flat
    gradient must give the same losses to ``atol=1e-4``.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference trajectory: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for step in range(10):
        ref_loss = do_update()
        print(step, ref_loss)
        losslist_ref.append(ref_loss)

    # Reset the variables and repeat with MpiAdam.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for step in range(10):
        test_loss, grad = lossandgrad()
        adam.update(grad, stepsize)
        print(step, test_loss)
        losslist_test.append(test_loss)

    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
Example #7
Source File: distributions.py From ICML2019-TREX with MIT License | 5 votes |
def validate_probtype(probtype, pdparam):
    """Monte-Carlo consistency check of a distribution type.

    Two identities are checked with ``N`` samples, each to within three
    standard errors:
      1. mean negative log-likelihood == entropy
      2. kl[p, q] == -ent[p] - E_p[log q]

    Args:
        probtype: distribution type exposing ``param_placeholder``,
            ``sample_placeholder`` and ``pdfromflat``.
        pdparam: 1-D numpy array of distribution parameters.

    Raises:
        AssertionError: if either identity is violated beyond three sigmas.
    """
    N = 100000

    # --- Check 1: mean negative log likelihood == differential entropy ---
    params_p = np.repeat(pdparam[None, :], N, axis=0)
    params_ph = probtype.param_placeholder([N])
    sample_ph = probtype.sample_placeholder([N])
    dist_p = probtype.pdfromflat(params_ph)
    calcloglik = U.function([sample_ph, params_ph], dist_p.logp(sample_ph))
    calcent = U.function([params_ph], dist_p.entropy())
    samples = tf.get_default_session().run(dist_p.sample(), feed_dict={params_ph: params_p})
    logliks = calcloglik(samples, params_p)
    entval_ll = -logliks.mean()
    entval_ll_stderr = logliks.std() / np.sqrt(N)
    entval = calcent(params_p).mean()
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # --- Check 2: kldiv[p,q] = - ent[p] - E_p[log q] ---
    params_ph2 = probtype.param_placeholder([N])
    dist_q = probtype.pdfromflat(params_ph2)
    # q is p with a small random perturbation of its parameters.
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    params_q = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([params_ph, params_ph2], dist_p.kl(dist_q))
    klval = calckl(params_p, params_q).mean()
    logliks = calcloglik(samples, params_q)
    klval_ll = -entval - logliks.mean()
    klval_ll_stderr = logliks.std() / np.sqrt(N)
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
Example #8
Source File: mlp_policy.py From ICML2019-TREX with MIT License | 5 votes |
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the MLP value and policy networks and the ``_act`` function.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
            log-std is a standalone variable rather than a network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Standardise observations with the running statistics, clipped to [-5, 5].
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value network.
    hidden = obz
    for layer_idx in range(1, num_hid_layers + 1):
        hidden = tf.nn.tanh(dense(hidden, hid_size, "vffc%i" % layer_idx, weight_init=U.normc_initializer(1.0)))
    self.vpred = dense(hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network.
    hidden = obz
    for layer_idx in range(1, num_hid_layers + 1):
        hidden = tf.nn.tanh(dense(hidden, hid_size, "polfc%i" % layer_idx, weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        half_width = pdtype.param_shape()[0] // 2
        mean = dense(hidden, half_width, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, half_width], initializer=tf.zeros_initializer())
        # `mean * 0.0 + logstd` broadcasts logstd to the shape of mean.
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(hidden, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC: `stochastic` is a named placeholder and `ac` is kept on self.
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #9
Source File: behavior_clone.py From ICML2019-TREX with MIT License | 5 votes |
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None, verbose=False):
    """Pretrain a policy with behavior cloning and save its variables.

    Args:
        env: environment providing ``observation_space`` and ``action_space``.
        policy_func: callable ``(name, ob_space, ac_space)`` returning the policy.
        dataset: object providing ``get_next_batch(batch_size, split)``.
        optim_batch_size: size of each training batch.
        max_iters: number of training iterations (converted with ``int``).
        adam_epsilon: epsilon passed to ``MpiAdam``.
        optim_stepsize: step size passed to ``adam.update``.
        ckpt_dir: checkpoint directory; a temporary directory name is used
            when ``None``.
        log_dir: not read by this function.
        task_name: joined onto ``ckpt_dir`` to form the save path.
        verbose: when true, log training and validation loss periodically.

    Returns:
        The path that was passed to ``U.save_state``.
    """
    # Validate about ten times per run. Clamp to 1 so that max_iters < 10
    # does not produce a modulo-by-zero in the verbose branch below.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy

    # Placeholders: "ob" and "stochastic" are looked up from the placeholder cache.
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")

    # Mean squared error between expert actions and the policy's action output.
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        # Only the path string is kept; the TemporaryDirectory object itself
        # is not retained.
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
Example #10
Source File: cnn_policy.py From ICML2019-TREX with MIT License | 5 votes |
def _init(self, ob_space, ac_space, kind):
    """Build the convolutional policy/value graph and the ``_act`` function.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        kind: ``'small'`` or ``'large'``; selects the convolutional stack.

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    # Each tuple holds the positional arguments forwarded to U.conv2d after
    # the input tensor; lin_units is the width of the dense layer that follows.
    if kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        lin_units = 256
    elif kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        lin_units = 512
    else:
        raise NotImplementedError

    for spec in conv_specs:
        x = tf.nn.relu(U.conv2d(x, *spec, pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(tf.layers.dense(x, lin_units, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    # Policy logits and value prediction share the features in x.
    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    # XXX: `stochastic` is an input of _act, but the action is always sampled.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #11
Source File: mlp_policy.py From ICML2019-TREX with MIT License | 5 votes |
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the MLP value ('vf') and policy ('pol') networks and ``_act``.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
            log-std is a standalone variable rather than a network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    with tf.variable_scope('vf'):
        # Standardise observations with the running statistics, clipped to [-5, 5].
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        hidden = obz
        for layer_idx in range(1, num_hid_layers + 1):
            hidden = tf.nn.tanh(tf.layers.dense(hidden, hid_size, name="fc%i" % layer_idx, kernel_initializer=U.normc_initializer(1.0)))
        self.vpred = tf.layers.dense(hidden, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    with tf.variable_scope('pol'):
        hidden = obz
        for layer_idx in range(1, num_hid_layers + 1):
            hidden = tf.nn.tanh(tf.layers.dense(hidden, hid_size, name='fc%i' % layer_idx, kernel_initializer=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            half_width = pdtype.param_shape()[0] // 2
            mean = tf.layers.dense(hidden, half_width, name='final', kernel_initializer=U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, half_width], initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of mean.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(hidden, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # `stochastic` selects between a sample and the mode of the distribution.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #12
Source File: mpi_running_mean_std.py From ICML2019-TREX with MIT License | 5 votes |
def __init__(self, epsilon=1e-2, shape=()):
    """Create the accumulator variables, derived statistics and update function.

    Args:
        epsilon: initial value of the sum-of-squares and count accumulators.
        shape: shape of the sum and sum-of-squares accumulators.
    """
    # Accumulators are float64 and excluded from the trainable variables.
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum",
        trainable=False,
    )
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False,
    )
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count",
        trainable=False,
    )
    self.shape = shape

    # Statistics derived from the accumulators; the variance estimate is
    # floored at 1e-2 before the square root is taken.
    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders for the increments added to each accumulator.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, sum_ph),
        tf.assign_add(self._sumsq, sumsq_ph),
        tf.assign_add(self._count, count_ph),
    ]
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [], updates=increments)
Example #13
Source File: mpi_adam.py From ICML2019-TREX with MIT License | 5 votes |
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TF's Adam.

    Ten steps of ``tf.train.AdamOptimizer`` give the reference losses. After
    re-initialising the variables, ten steps of ``MpiAdam`` driven by the flat
    gradient must give the same losses to ``atol=1e-4``.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference trajectory: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for step in range(10):
        ref_loss = do_update()
        print(step, ref_loss)
        losslist_ref.append(ref_loss)

    # Reset the variables and repeat with MpiAdam.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for step in range(10):
        test_loss, grad = lossandgrad()
        adam.update(grad, stepsize)
        print(step, test_loss)
        losslist_test.append(test_loss)

    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
Example #14
Source File: mpi_running_mean_std.py From ICML2019-TREX with MIT License | 5 votes |
def __init__(self, epsilon=1e-2, shape=()):
    """Create the accumulator variables, derived statistics and update function.

    Args:
        epsilon: initial value of the sum-of-squares and count accumulators.
        shape: shape of the sum and sum-of-squares accumulators.
    """
    # Accumulators are float64 and excluded from the trainable variables.
    self._sum = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(0.0),
        name="runningsum",
        trainable=False,
    )
    self._sumsq = tf.get_variable(
        dtype=tf.float64,
        shape=shape,
        initializer=tf.constant_initializer(epsilon),
        name="runningsumsq",
        trainable=False,
    )
    self._count = tf.get_variable(
        dtype=tf.float64,
        shape=(),
        initializer=tf.constant_initializer(epsilon),
        name="count",
        trainable=False,
    )
    self.shape = shape

    # Statistics derived from the accumulators; the variance estimate is
    # floored at 1e-2 before the square root is taken.
    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders for the increments added to each accumulator.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, sum_ph),
        tf.assign_add(self._sumsq, sumsq_ph),
        tf.assign_add(self._count, count_ph),
    ]
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [], updates=increments)
Example #15
Source File: nosharing_cnn_policy.py From DRL_DeliveryDuel with MIT License | 5 votes |
def _init(self, ob_space, ac_space):
    """Build separate convolutional policy and value networks and ``_act``.

    The 'pol' and 'vf' variable scopes each get their own copy of the same
    convolutional trunk; no layers are shared between them.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    obscaled = ob / 255.0

    def conv_trunk(inputs):
        # Two conv layers, flatten, then a 128-unit dense layer; variables
        # land in whichever variable scope is active at call time.
        h = tf.nn.relu(U.conv2d(inputs, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        h = tf.nn.relu(U.conv2d(h, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        h = U.flattenallbut0(h)
        return tf.nn.relu(tf.layers.dense(h, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    with tf.variable_scope("pol"):
        features = conv_trunk(obscaled)
        logits = tf.layers.dense(features, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    with tf.variable_scope("vf"):
        features = conv_trunk(obscaled)
        self.vpred = tf.layers.dense(features, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    # `stochastic` is an input of _act, but the action is always sampled.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #16
Source File: cnn_policy.py From DRL_DeliveryDuel with MIT License | 5 votes |
def _init(self, ob_space, ac_space, kind):
    """Build the convolutional policy/value graph and the ``_act`` function.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        kind: ``'small'`` or ``'large'``; selects the convolutional stack.

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    # Each tuple holds the positional arguments forwarded to U.conv2d after
    # the input tensor; lin_units is the width of the dense layer that follows.
    if kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        lin_units = 256
    elif kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        lin_units = 512
    else:
        raise NotImplementedError

    for spec in conv_specs:
        x = tf.nn.relu(U.conv2d(x, *spec, pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(tf.layers.dense(x, lin_units, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    # Policy logits and value prediction share the features in x.
    logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:, 0]

    self.state_in = []
    self.state_out = []

    # XXX: `stochastic` is an input of _act, but the action is always sampled.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #17
Source File: policies.py From DRL_DeliveryDuel with MIT License | 5 votes |
def __init__(self, ob_dim, ac_dim):
    """Build the Gaussian MLP policy graph, its action sampler and KL function.

    The expressions built here are used in two places:
      (1) when sampling actions -- names containing "sampled";
      (2) when computing loss functions for the policy update -- names
          containing "old".

    Args:
        ob_dim: observation size; the "ob" placeholder is ``ob_dim * 2`` wide.
        ac_dim: action dimensionality.
    """
    ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob")  # batch of observations
    oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac")  # batch of previous actions
    oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist")  # batch of previous action distributions
    adv_n = tf.placeholder(tf.float32, shape=[None], name="adv")  # advantage function estimate

    wd_dict = {}

    def tanh_layer(inputs, layer_name):
        # 64-unit tanh layer; wd_dict is handed to `dense` as weight_loss_dict.
        return tf.nn.tanh(dense(inputs, 64, layer_name, weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))

    h1 = tanh_layer(ob_no, "h1")
    h2 = tanh_layer(h1, "h2")
    mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict)  # Mean control output
    self.wd_dict = wd_dict

    # Variance on outputs: one log-std per action dimension, tiled over the batch.
    self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer())
    logstd_1a = tf.expand_dims(logstd_1a, 0)
    std_1a = tf.exp(logstd_1a)
    std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])

    # ac_dist packs [mean | std] along axis 1: the first ac_dim columns are
    # the mean, the remaining ac_dim columns are the std.
    ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)

    # This is the sampled action we'll perform.
    sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim]

    def gaussian_logprob(ac_na):
        # Log-density of ac_na under the diagonal Gaussian described by ac_dist.
        return (- tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1)
                - 0.5 * tf.log(2.0*np.pi)*ac_dim
                - 0.5 * tf.reduce_sum(tf.square(ac_dist[:, :ac_dim] - ac_na) / (tf.square(ac_dist[:, ac_dim:])), axis=1))

    logprobsampled_n = gaussian_logprob(sampled_ac_na)  # Logprob of sampled action
    logprob_n = gaussian_logprob(oldac_na)  # Logprob of previous actions under CURRENT policy

    # KL divergence between the old action distribution and the current one.
    kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
    surr = - tf.reduce_mean(adv_n * logprob_n)  # Loss function that we'll differentiate to get the policy gradient
    surr_sampled = - tf.reduce_mean(logprob_n)  # Sampled loss of the policy

    self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n])  # Generate a new action and its logprob
    self.compute_kl = U.function([ob_no, oldac_dist], kl)
    self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)  # Input and output variables needed for computing loss
    U.initialize()  # Initialize uninitialized TF variables
Example #18
Source File: behavior_clone.py From DRL_DeliveryDuel with MIT License | 5 votes |
def learn(env, policy_func, dataset, optim_batch_size=128, max_iters=1e4,
          adam_epsilon=1e-5, optim_stepsize=3e-4,
          ckpt_dir=None, log_dir=None, task_name=None, verbose=False):
    """Pretrain a policy with behavior cloning and save its variables.

    Args:
        env: environment providing ``observation_space`` and ``action_space``.
        policy_func: callable ``(name, ob_space, ac_space)`` returning the policy.
        dataset: object providing ``get_next_batch(batch_size, split)``.
        optim_batch_size: size of each training batch.
        max_iters: number of training iterations (converted with ``int``).
        adam_epsilon: epsilon passed to ``MpiAdam``.
        optim_stepsize: step size passed to ``adam.update``.
        ckpt_dir: checkpoint directory; a temporary directory name is used
            when ``None``.
        log_dir: not read by this function.
        task_name: joined onto ``ckpt_dir`` to form the save path.
        verbose: when true, log training and validation loss periodically.

    Returns:
        The path that was passed to ``U.save_state``.
    """
    # Validate about ten times per run. Clamp to 1 so that max_iters < 10
    # does not produce a modulo-by-zero in the verbose branch below.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy

    # Placeholders: "ob" and "stochastic" are looked up from the placeholder cache.
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")

    # Mean squared error between expert actions and the policy's action output.
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic], [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(train_loss, val_loss))

    if ckpt_dir is None:
        # Only the path string is kept; the TemporaryDirectory object itself
        # is not retained.
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        savedir_fname = osp.join(ckpt_dir, task_name)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
Example #19
Source File: mlp_policy.py From DRL_DeliveryDuel with MIT License | 5 votes |
def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
    """Build the MLP value and policy networks and the ``_act`` function.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space passed to ``make_pdtype``.
        hid_size: width of every hidden layer.
        num_hid_layers: number of hidden layers in each network.
        gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
            log-std is a standalone variable rather than a network output.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    self.pdtype = pdtype = make_pdtype(ac_space)
    sequence_length = None

    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

    with tf.variable_scope("obfilter"):
        self.ob_rms = RunningMeanStd(shape=ob_space.shape)

    # Standardise observations with the running statistics, clipped to [-5, 5].
    obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

    # Value network.
    hidden = obz
    for layer_idx in range(1, num_hid_layers + 1):
        hidden = tf.nn.tanh(dense(hidden, hid_size, "vffc%i" % layer_idx, weight_init=U.normc_initializer(1.0)))
    self.vpred = dense(hidden, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

    # Policy network.
    hidden = obz
    for layer_idx in range(1, num_hid_layers + 1):
        hidden = tf.nn.tanh(dense(hidden, hid_size, "polfc%i" % layer_idx, weight_init=U.normc_initializer(1.0)))

    if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
        half_width = pdtype.param_shape()[0] // 2
        mean = dense(hidden, half_width, "polfinal", U.normc_initializer(0.01))
        logstd = tf.get_variable(name="logstd", shape=[1, half_width], initializer=tf.zeros_initializer())
        # `mean * 0.0 + logstd` broadcasts logstd to the shape of mean.
        pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
    else:
        pdparam = dense(hidden, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

    self.pd = pdtype.pdfromflat(pdparam)

    self.state_in = []
    self.state_out = []

    # change for BC: `stochastic` is a named placeholder and `ac` is kept on self.
    stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
    ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.ac = ac
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #20
Source File: adversary.py From DRL_DeliveryDuel with MIT License | 5 votes |
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
    """Build the discriminator graph, its loss terms and the policy reward op.

    Args:
        env: environment whose ``observation_space`` and ``action_space``
            shapes are recorded on the instance.
        hidden_size: stored on ``self.hidden_size``.
        entcoeff: coefficient applied to the entropy term of the total loss.
        lr_rate: accepted by the signature but not read in this constructor.
        scope: stored on ``self.scope``.
    """
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    self.input_shape = tuple(
        o + a for o, a in zip(self.observation_shape, self.actions_shape)
    )
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    # NOTE(review): build_ph() presumably creates the *_ph placeholders used
    # below -- confirm against its definition.
    self.build_ph()

    # Build graph: the second call is made with reuse=True.
    generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)

    # Accuracy: generator samples should score below 0.5, expert samples above.
    generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
    expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))

    # Regression loss. With x = logits and z = targets:
    #   z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    # Generator samples use targets of 0, expert samples use targets of 1.
    generator_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=generator_logits, labels=tf.zeros_like(generator_logits)
        )
    )
    expert_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=expert_logits, labels=tf.ones_like(expert_logits)
        )
    )

    # Entropy loss over the concatenated logits.
    all_logits = tf.concat([generator_logits, expert_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(all_logits))
    entropy_loss = -entcoeff * entropy

    # Loss + accuracy terms, with names in matching order.
    self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
    self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
    self.total_loss = generator_loss + expert_loss + entropy_loss

    # Reward for the policy; 1e-8 keeps the log argument away from zero.
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(generator_logits) + 1e-8)

    var_list = self.get_trainable_variables()
    loss_inputs = [
        self.generator_obs_ph,
        self.generator_acs_ph,
        self.expert_obs_ph,
        self.expert_acs_ph,
    ]
    self.lossandgrad = U.function(loss_inputs, self.losses + [U.flatgrad(self.total_loss, var_list)])
Example #21
Source File: distributions.py From learning2run with MIT License | 5 votes |
def validate_probtype(probtype, pdparam):
    """Monte-Carlo consistency check of a distribution type.

    Two identities are checked with ``N`` samples, each to within three
    standard errors:
      1. mean negative log-likelihood == entropy
      2. kl[p, q] == -ent[p] - E_p[log q]

    Args:
        probtype: distribution type exposing ``param_placeholder``,
            ``sample_placeholder`` and ``pdclass``.
        pdparam: 1-D numpy array of distribution parameters.

    Raises:
        AssertionError: if either identity is violated beyond three sigmas.
    """
    N = 100000

    # --- Check 1: mean negative log likelihood == differential entropy ---
    params_p = np.repeat(pdparam[None, :], N, axis=0)
    params_ph = probtype.param_placeholder([N])
    sample_ph = probtype.sample_placeholder([N])
    dist_p = probtype.pdclass()(params_ph)
    calcloglik = U.function([sample_ph, params_ph], dist_p.logp(sample_ph))
    calcent = U.function([params_ph], dist_p.entropy())
    samples = U.eval(dist_p.sample(), feed_dict={params_ph: params_p})
    logliks = calcloglik(samples, params_p)
    entval_ll = -logliks.mean()
    entval_ll_stderr = logliks.std() / np.sqrt(N)
    entval = calcent(params_p).mean()
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr  # within 3 sigmas

    # --- Check 2: kldiv[p,q] = - ent[p] - E_p[log q] ---
    params_ph2 = probtype.param_placeholder([N])
    dist_q = probtype.pdclass()(params_ph2)
    # q is p with a small random perturbation of its parameters.
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    params_q = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([params_ph, params_ph2], dist_p.kl(dist_q))
    klval = calckl(params_p, params_q).mean()
    logliks = calcloglik(samples, params_q)
    klval_ll = -entval - logliks.mean()
    klval_ll_stderr = logliks.std() / np.sqrt(N)
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr  # within 3 sigmas
Example #22
Source File: mpi_adam.py From learning2run with MIT License | 5 votes |
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TF's Adam.

    The reference run applies ``tf.train.AdamOptimizer`` for ten steps. The
    variables are then re-initialised and the same ten steps are taken with
    ``MpiAdam`` driven by the flat gradient. The two loss sequences must
    agree to ``atol=1e-4``.

    Raises:
        AssertionError: if the two loss trajectories differ.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference trajectory: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(10):
        ref_loss = do_update()
        print(i, ref_loss)
        losslist_ref.append(ref_loss)

    # Reset the variables and repeat with MpiAdam.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No `updates=[update_op]` here: running the TF Adam step inside
    # lossandgrad would move the variables a second time per iteration on
    # top of adam.update(), so the trajectories could never match.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losslist_test = []
    for i in range(10):
        test_loss, grad = lossandgrad()
        adam.update(grad, stepsize)
        print(i, test_loss)
        losslist_test.append(test_loss)

    np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4)
Example #23
Source File: mpi_running_mean_std.py From learning2run with MIT License | 5 votes |
def __init__(self, epsilon=1e-2, shape=()):
    """Create the running-statistics variables and their increment function.

    Three non-trainable float64 variables are created: a running sum
    (initialised to 0), a running sum of squares and a scalar count (both
    initialised to ``epsilon``). ``self.mean`` and ``self.std`` are float
    tensors derived from them; the variance is clamped below at 1e-2 before
    the square root is taken.

    Args:
        epsilon: initial value of the sum-of-squares and count variables.
        shape: shape of the sum and sum-of-squares variables.
    """
    def _accumulator(name, var_shape, init_value):
        # Non-trainable float64 variable filled with a constant.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(init_value),
            name=name,
            trainable=False)

    self._sum = _accumulator("runningsum", shape, 0.0)
    self._sumsq = _accumulator("runningsumsq", shape, epsilon)
    self._count = _accumulator("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders carrying the amounts to add to each accumulator.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, sum_ph),
        tf.assign_add(self._sumsq, sumsq_ph),
        tf.assign_add(self._count, count_ph),
    ]
    # No outputs: the function is called only for its assign_add updates.
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [], updates=increments)
Example #24
Source File: cnn_policy.py From learning2run with MIT License | 5 votes |
def _init(self, ob_space, ac_space, kind):
    """Build the convolutional policy/value graph and the ``_act`` function.

    The observation placeholder is divided by 255.0, passed through a stack
    of VALID-padded ReLU convolutions chosen by ``kind``, flattened, and fed
    to one dense ReLU layer. Action logits and the value prediction are both
    dense heads on top of that shared feature vector.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space, passed to ``make_pdtype``.
        kind: ``'small'`` (two conv layers, 256 hidden units) or ``'large'``
            (three conv layers, 512 hidden units).

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype
    sequence_length = None  # leading placeholder dimension is left unspecified

    ob = U.get_placeholder(
        name="ob",
        dtype=tf.float32,
        shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    # Each entry: (num_filters, scope name, filter size, stride).
    if kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        hidden_units = 256
    elif kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        hidden_units = 512
    else:
        raise NotImplementedError

    for num_filters, layer_name, filter_size, stride in conv_specs:
        x = tf.nn.relu(U.conv2d(x, num_filters, layer_name, filter_size, stride, pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(U.dense(x, hidden_units, 'lin', U.normc_initializer(1.0)))

    logits = U.dense(x, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    value_head = U.dense(x, 1, "value", U.normc_initializer(1.0))
    self.vpred = value_head[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # XXX NOTE(review): `stochastic` is an input of `_act` but does not
    # influence `ac`; the action is always sampled.
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #25
Source File: nosharing_cnn_policy.py From learning2run with MIT License | 5 votes |
def _init(self, ob_space, ac_space):
    """Build separate policy and value conv networks and the ``_act`` function.

    The observation placeholder is divided by 255.0 and fed to two
    structurally identical feature towers, one under variable scope ``"pol"``
    and one under ``"vf"``, so the policy and value heads share no weights.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space, passed to ``make_pdtype``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype
    sequence_length = None  # leading placeholder dimension is left unspecified

    ob = U.get_placeholder(
        name="ob",
        dtype=tf.float32,
        shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    def _feature_tower():
        # Two VALID-padded ReLU convs, flatten, then a 128-unit dense ReLU layer.
        # Variables land in whichever variable scope is active at call time.
        h = tf.nn.relu(U.conv2d(obscaled, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        h = tf.nn.relu(U.conv2d(h, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        h = U.flattenallbut0(h)
        return tf.nn.relu(U.dense(h, 128, 'lin', U.normc_initializer(1.0)))

    with tf.variable_scope("pol"):
        pol_features = _feature_tower()
        logits = U.dense(pol_features, pdtype.param_shape()[0], "logits", U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    with tf.variable_scope("vf"):
        vf_features = _feature_tower()
        self.vpred = U.dense(vf_features, 1, "value", U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # XXX NOTE(review): `stochastic` is an input of `_act` but does not
    # influence `ac`; the action is always sampled.
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #26
Source File: adversary.py From lirpg with MIT License | 5 votes |
def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
    """Build the discriminator graph, its losses, reward op and gradient function.

    The same network (weights reused) scores generator and expert
    (observation, action) batches. Generator samples are trained towards
    label 0 and expert samples towards label 1 with sigmoid cross-entropy,
    plus an entropy bonus weighted by ``entcoeff``.

    Args:
        env: environment; only ``observation_space.shape`` and
            ``action_space.shape`` are read here.
        hidden_size: stored on ``self.hidden_size``.
        entcoeff: weight of the entropy term in the total loss.
        lr_rate: not referenced in this method.
        scope: stored on ``self.scope``.
    """
    self.scope = scope
    self.observation_shape = env.observation_space.shape
    self.actions_shape = env.action_space.shape
    # Element-wise sum of the observation and action shape tuples.
    self.input_shape = tuple(
        o + a for o, a in zip(self.observation_shape, self.actions_shape))
    self.num_actions = env.action_space.shape[0]
    self.hidden_size = hidden_size
    self.build_ph()

    # Build graph: second call reuses the variables created by the first.
    gen_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
    exp_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)

    # Build accuracy: generator counts as correct below 0.5, expert above 0.5.
    gen_correct = tf.nn.sigmoid(gen_logits) < 0.5
    generator_acc = tf.reduce_mean(tf.to_float(gen_correct))
    exp_correct = tf.nn.sigmoid(exp_logits) > 0.5
    expert_acc = tf.reduce_mean(tf.to_float(exp_correct))

    # Build regression loss
    # let x = logits, z = targets.
    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    generator_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=gen_logits, labels=tf.zeros_like(gen_logits)))
    expert_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=exp_logits, labels=tf.ones_like(exp_logits)))

    # Build entropy loss over both batches concatenated along axis 0.
    all_logits = tf.concat([gen_logits, exp_logits], 0)
    entropy = tf.reduce_mean(logit_bernoulli_entropy(all_logits))
    entropy_loss = -entcoeff * entropy

    # Loss + Accuracy terms, kept as parallel lists of tensors and names.
    named_terms = [
        ("generator_loss", generator_loss),
        ("expert_loss", expert_loss),
        ("entropy", entropy),
        ("entropy_loss", entropy_loss),
        ("generator_acc", generator_acc),
        ("expert_acc", expert_acc),
    ]
    self.losses = [term for _, term in named_terms]
    self.loss_name = [label for label, _ in named_terms]
    self.total_loss = generator_loss + expert_loss + entropy_loss

    # Build Reward for policy; the 1e-8 keeps the log argument above zero.
    self.reward_op = -tf.log(1 - tf.nn.sigmoid(gen_logits) + 1e-8)

    var_list = self.get_trainable_variables()
    graph_inputs = [
        self.generator_obs_ph,
        self.generator_acs_ph,
        self.expert_obs_ph,
        self.expert_acs_ph,
    ]
    # Returns every entry of self.losses followed by the flat gradient of total_loss.
    self.lossandgrad = U.function(
        graph_inputs,
        self.losses + [U.flatgrad(self.total_loss, var_list)])
Example #27
Source File: mpi_adam.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_MpiAdam():
    """Check that MpiAdam follows the same loss trajectory as TF's Adam.

    A toy loss over two variables is minimised for 10 steps with
    ``tf.train.AdamOptimizer``. The variables are then re-initialised and the
    same loss is minimised for 10 steps with ``MpiAdam``, fed the flat
    gradient. The two loss sequences are printed and must agree.

    Requires a default TensorFlow session to be active.

    Raises:
        AssertionError: if the two loss trajectories differ by more than 1e-4.
    """
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
    do_update = U.function([], loss, updates=[update_op])

    # Reference run: TensorFlow's own Adam.
    tf.get_default_session().run(tf.global_variables_initializer())
    losses_ref = []
    for i in range(10):
        loss_ref = do_update()
        print(i, loss_ref)
        losses_ref.append(loss_ref)

    # Reset the variables (and Adam slots) so both runs start identically.
    tf.set_random_seed(0)
    tf.get_default_session().run(tf.global_variables_initializer())

    var_list = [a, b]
    # No `updates=[update_op]` here: running TF's Adam step alongside the
    # MpiAdam step would move the variables twice per iteration and make the
    # comparison with the reference run meaningless.
    lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)])
    adam = MpiAdam(var_list)

    losses_test = []
    for i in range(10):
        l, g = lossandgrad()
        adam.update(g, stepsize)
        print(i, l)
        losses_test.append(l)

    np.testing.assert_allclose(np.array(losses_ref), np.array(losses_test), atol=1e-4)
Example #28
Source File: mpi_running_mean_std.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 5 votes |
def __init__(self, epsilon=1e-2, shape=()):
    """Create the running-statistics variables and their increment function.

    Three non-trainable float64 variables are created: a running sum
    (initialised to 0), a running sum of squares and a scalar count (both
    initialised to ``epsilon``). ``self.mean`` and ``self.std`` are float
    tensors derived from them; the variance is clamped below at 1e-2 before
    the square root is taken.

    Args:
        epsilon: initial value of the sum-of-squares and count variables.
        shape: shape of the sum and sum-of-squares variables.
    """
    def _accumulator(name, var_shape, init_value):
        # Non-trainable float64 variable filled with a constant.
        return tf.get_variable(
            dtype=tf.float64,
            shape=var_shape,
            initializer=tf.constant_initializer(init_value),
            name=name,
            trainable=False)

    self._sum = _accumulator("runningsum", shape, 0.0)
    self._sumsq = _accumulator("runningsumsq", shape, epsilon)
    self._count = _accumulator("count", (), epsilon)
    self.shape = shape

    self.mean = tf.to_float(self._sum / self._count)
    mean_of_squares = tf.to_float(self._sumsq / self._count)
    variance = mean_of_squares - tf.square(self.mean)
    self.std = tf.sqrt(tf.maximum(variance, 1e-2))

    # Placeholders carrying the amounts to add to each accumulator.
    sum_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
    sumsq_ph = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
    count_ph = tf.placeholder(shape=[], dtype=tf.float64, name='count')
    increments = [
        tf.assign_add(self._sum, sum_ph),
        tf.assign_add(self._sumsq, sumsq_ph),
        tf.assign_add(self._count, count_ph),
    ]
    # No outputs: the function is called only for its assign_add updates.
    self.incfiltparams = U.function([sum_ph, sumsq_ph, count_ph], [], updates=increments)
Example #29
Source File: cnn_policy.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _init(self, ob_space, ac_space, kind):
    """Build the convolutional policy/value graph and the ``_act`` function.

    The observation placeholder is divided by 255.0, passed through a stack
    of VALID-padded ReLU convolutions chosen by ``kind``, flattened, and fed
    to one dense ReLU layer. Action logits and the value prediction are both
    dense heads on top of that shared feature vector.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space, passed to ``make_pdtype``.
        kind: ``'small'`` (two conv layers, 256 hidden units) or ``'large'``
            (three conv layers, 512 hidden units).

    Raises:
        NotImplementedError: if ``kind`` is neither ``'small'`` nor ``'large'``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype
    sequence_length = None  # leading placeholder dimension is left unspecified

    ob = U.get_placeholder(
        name="ob",
        dtype=tf.float32,
        shape=[sequence_length] + list(ob_space.shape))

    x = ob / 255.0
    # Each entry: (num_filters, scope name, filter size, stride).
    if kind == 'small':  # from A3C paper
        conv_specs = [
            (16, "l1", [8, 8], [4, 4]),
            (32, "l2", [4, 4], [2, 2]),
        ]
        hidden_units = 256
    elif kind == 'large':  # Nature DQN
        conv_specs = [
            (32, "l1", [8, 8], [4, 4]),
            (64, "l2", [4, 4], [2, 2]),
            (64, "l3", [3, 3], [1, 1]),
        ]
        hidden_units = 512
    else:
        raise NotImplementedError

    for num_filters, layer_name, filter_size, stride in conv_specs:
        x = tf.nn.relu(U.conv2d(x, num_filters, layer_name, filter_size, stride, pad="VALID"))
    x = U.flattenallbut0(x)
    x = tf.nn.relu(
        tf.layers.dense(x, hidden_units, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    logits = tf.layers.dense(
        x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
    self.pd = pdtype.pdfromflat(logits)
    value_head = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
    self.vpred = value_head[:, 0]

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # XXX NOTE(review): `stochastic` is an input of `_act` but does not
    # influence `ac`; the action is always sampled.
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])
Example #30
Source File: nosharing_cnn_policy.py From rl_graph_generation with BSD 3-Clause "New" or "Revised" License | 5 votes |
def _init(self, ob_space, ac_space):
    """Build separate policy and value conv networks and the ``_act`` function.

    The observation placeholder is divided by 255.0 and fed to two
    structurally identical feature towers, one under variable scope ``"pol"``
    and one under ``"vf"``, so the policy and value heads share no weights.

    Args:
        ob_space: observation space; must be a ``gym.spaces.Box``.
        ac_space: action space, passed to ``make_pdtype``.
    """
    assert isinstance(ob_space, gym.spaces.Box)

    pdtype = make_pdtype(ac_space)
    self.pdtype = pdtype
    sequence_length = None  # leading placeholder dimension is left unspecified

    ob = U.get_placeholder(
        name="ob",
        dtype=tf.float32,
        shape=[sequence_length] + list(ob_space.shape))
    obscaled = ob / 255.0

    def _feature_tower():
        # Two VALID-padded ReLU convs, flatten, then a 128-unit dense ReLU layer.
        # Variables land in whichever variable scope is active at call time.
        h = tf.nn.relu(U.conv2d(obscaled, 8, "l1", [8, 8], [4, 4], pad="VALID"))
        h = tf.nn.relu(U.conv2d(h, 16, "l2", [4, 4], [2, 2], pad="VALID"))
        h = U.flattenallbut0(h)
        return tf.nn.relu(
            tf.layers.dense(h, 128, name='lin', kernel_initializer=U.normc_initializer(1.0)))

    with tf.variable_scope("pol"):
        pol_features = _feature_tower()
        logits = tf.layers.dense(
            pol_features, pdtype.param_shape()[0], name='logits',
            kernel_initializer=U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(logits)

    with tf.variable_scope("vf"):
        vf_features = _feature_tower()
        self.vpred = tf.layers.dense(
            vf_features, 1, name='value', kernel_initializer=U.normc_initializer(1.0))
        self.vpredz = self.vpred

    self.state_in = []
    self.state_out = []

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    # NOTE(review): `stochastic` is an input of `_act` but does not influence
    # `ac`; the action is always sampled.
    ac = self.pd.sample()
    self._act = U.function([stochastic, ob], [ac, self.vpred])