Python constants.ENTROPY_BETA Examples
The following are 6 code examples of constants.ENTROPY_BETA. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module constants, or try the search function.
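All six examples use ENTROPY_BETA the same way: it is the coefficient on a policy-entropy bonus added to the actor loss of an A3C-style agent, which discourages the policy from collapsing to a deterministic one too early. The sketch below is a minimal NumPy illustration of that term, not code from any of the projects that follow; the value 0.01 is only a placeholder, since each project defines its own value in its constants module.

import numpy as np

ENTROPY_BETA = 0.01  # placeholder value; each project sets its own in constants.py

def policy_loss(pi, taken_action, td, entropy_beta=ENTROPY_BETA):
    """pi: (batch, n_actions) action probabilities, taken_action: one-hot actions,
    td: (batch,) advantage / temporal-difference estimates."""
    log_pi = np.log(np.clip(pi, 1e-20, 1.0))              # clip to avoid log(0)
    entropy = -np.sum(pi * log_pi, axis=1)                 # per-sample policy entropy
    pg_term = np.sum(log_pi * taken_action, axis=1) * td   # policy-gradient term
    # Minus sign: the objective is maximized in the A3C paper, but optimizers minimize.
    return -np.mean(pg_term + entropy_beta * entropy)

Example #1 builds this expression inline via constants.ENTROPY_BETA; the remaining examples simply forward the constant to the network's prepare_loss(ENTROPY_BETA) method, which constructs the equivalent loss internally.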
Example #1
Source File: rl_network.py From thor-iqa-cvpr-2018 with Apache License 2.0 | 5 votes |
def rl_loss(self):
    with tf.variable_scope('a3c_loss'):
        action_size = self.pi.get_shape().as_list()[1]
        self.taken_action = tf.placeholder(tf.float32, [None, action_size], name='taken_action')

        # temporal difference (R-V) (input for policy)
        self.td = tf.placeholder(tf.float32, [None], name='td_placeholder')

        # avoid NaN with clipping when value in pi becomes zero
        log_pi = tf.log(tf.clip_by_value(self.pi, 1e-20, 1.0))

        # policy entropy
        entropy = -tf.reduce_sum(self.pi * log_pi, axis=1)

        # policy loss (output) (Adding minus, because the original paper's
        # objective function is for gradient ascent, but we use a gradient
        # descent optimizer.)
        self.policy_loss = -tf.reduce_mean(
            tf.reduce_sum(tf.multiply(log_pi, self.taken_action), axis=1) * self.td +
            entropy * constants.ENTROPY_BETA)

        # R (input for value)
        self.r = tf.placeholder(tf.float32, [None], name='reward_placeholder')

        # value loss (output)
        # (Learning rate for Critic is half of Actor's, so multiply by 0.5;
        # the other 0.5 comes from the L2 loss.)
        self.value_loss = 0.25 * tf.losses.huber_loss(self.r, self.v)

        # gradients of policy and value are summed up
        self.rl_total_loss = self.policy_loss + self.value_loss
Example #2
Source File: training_thread.py From icra2017-visual-navigation with MIT License | 4 votes |
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             network_scope="network", scene_scope="scene", task_scope="task"):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    self.local_network = ActorCriticFFNetwork(
        action_size=ACTION_SIZE,
        device=device,
        network_scope=network_scope,
        scene_scopes=[scene_scope])
    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())
    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [self._local_var_name(x) for x in self.trainer.get_accum_grad_list()]
    global_net_vars = [x for x in global_network.get_vars()
                       if self._get_accum_grad_name(x) in accum_grad_names]

    self.apply_gradients = grad_applier.apply_gradients(
        global_net_vars, self.trainer.get_accum_grad_list())

    self.sync = self.local_network.sync_from(global_network)
    self.env = None
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf
Example #3
Source File: a3c_training_thread.py From a3c-distributed_tensorflow with MIT License | 4 votes |
def __init__(self, thread_index, global_network,
             pinitial_learning_rate, plearning_rate_input, pgrad_applier,
             vinitial_learning_rate, vlearning_rate_input, vgrad_applier,
             max_global_time_step, device, task_index=""):
    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step

    self.game_state = GameState()
    state = self.game_state.reset()
    self.game_state.reset_gs(state)
    self.action_size = self.game_state.action_size
    self.state_size = self.game_state.state_size
    self.local_max_iter = self.game_state.local_max_iter

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(
            self.action_size, self.state_size,
            self.game_state.action_low, self.game_state.action_high,
            thread_index, device)
    else:
        self.local_network = GameACFFNetwork(
            self.action_size, self.state_size,
            self.game_state.action_low, self.game_state.action_high,
            thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
        self.policy_gradients = tf.gradients(
            self.local_network.policy_loss, pvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)
        vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
        self.value_gradients = tf.gradients(
            self.local_network.value_loss, vvar_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_policy_gradients = pgrad_applier.apply_gradients(
        self.local_network.get_pvars(), self.policy_gradients)
    self.apply_value_gradients = vgrad_applier.apply_gradients(
        self.local_network.get_vvars(), self.value_gradients)

    self.local_t = 0
    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #4
Source File: a3c_training_thread.py From a3c-distributed_tensorflow with MIT License | 4 votes |
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             task_index=""):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    if global_network:
        self.apply_gradients = grad_applier.apply_gradients(
            global_network.get_vars(), self.gradients)
        self.sync = self.local_network.sync_from(global_network)
        self.mode = "threading"
    else:
        self.apply_gradients = grad_applier.apply_gradients(
            self.local_network.get_vars(), self.gradients)
        self.mode = "dist_tensor"

    if not task_index:
        self.game_state = GameState(113 * thread_index)
    else:
        self.game_state = GameState(113 * task_index)

    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #5
Source File: a3c_training_thread.py From async_deep_reinforce with Apache License 2.0 | 4 votes |
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
        self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
        self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(), self.gradients)

    self.sync = self.local_network.sync_from(global_network)
    self.game_state = GameState(113 * thread_index)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0
Example #6
Source File: a3c_training_thread.py From pathnet with MIT License | 4 votes |
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, max_global_time_step, device,
             FLAGS="", task_index=""):
    self.thread_index = thread_index
    self.task_index = task_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.limit_global_time_step = 100 * 10 ** 6

    if FLAGS.use_lstm:
        self.local_network = GameACPathNetLSTMNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    else:
        self.local_network = GameACPathNetNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
        var_refs = [v._ref() for v in self.local_network.get_vars()]
        self.gradients = tf.gradients(
            self.local_network.total_loss, var_refs,
            gate_gradients=False,
            aggregation_method=None,
            colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(), self.gradients)

    self.game_state = GameState(113 * task_index)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0