Python constants.ENTROPY_BETA Examples

The following are 6 code examples of constants.ENTROPY_BETA, a module-level constant (not a callable) imported from each project's constants module. You can go to the original project or source file by following the reference above each example, or check out the other members of the constants module.
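In every project listed below, ENTROPY_BETA weights the policy-entropy bonus in an A3C-style loss: a larger value pushes the policy toward higher entropy and therefore more exploration. The definition below is an illustrative sketch only; 0.01 is a common choice in A3C implementations, but the actual value in each project may differ.

# constants.py -- illustrative sketch, not copied from any project below
ENTROPY_BETA = 0.01  # weight of the policy-entropy bonus in the actor loss

# typical usage (sign conventions vary between projects):
#   policy_loss = -(log_prob_of_action * advantage + entropy * constants.ENTROPY_BETA)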
Example #1
Source File: rl_network.py    From thor-iqa-cvpr-2018 with Apache License 2.0
def rl_loss(self):
        with tf.variable_scope('a3c_loss'):
            action_size = self.pi.get_shape().as_list()[1]
            self.taken_action = tf.placeholder(tf.float32, [None, action_size], name='taken_action')

            # temporal difference (R - V) (input for policy)
            self.td = tf.placeholder(tf.float32, [None], name='td_placeholder')

            # avoid NaN with clipping when value in pi becomes zero
            log_pi = tf.log(tf.clip_by_value(self.pi, 1e-20, 1.0))

            # policy entropy
            entropy = -tf.reduce_sum(self.pi * log_pi, axis=1)

            # policy loss (output). (The minus sign is added because the original
            # paper's objective is maximized by gradient ascent, while we use a
            # gradient-descent optimizer.)
            self.policy_loss = -tf.reduce_mean(tf.reduce_sum(
                tf.multiply(log_pi, self.taken_action), axis=1) * self.td + entropy * constants.ENTROPY_BETA)

            # R (input for value)
            self.r = tf.placeholder(tf.float32, [None], name='reward_placeholder')

            # value loss (output)
            # (The Critic's learning rate is half of the Actor's, giving one factor
            # of 0.5; another 0.5 comes from the 1/2 in the L2 loss, hence 0.25.)
            self.value_loss = 0.25 * tf.losses.huber_loss(self.r, self.v)

            # gradients of policy and value are summed up
            self.rl_total_loss = self.policy_loss + self.value_loss 
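The graph above is easier to sanity-check when restated outside TensorFlow. The following is a minimal NumPy sketch of the same entropy-regularized policy loss, with hypothetical inputs; it is not part of the thor-iqa-cvpr-2018 code.

import numpy as np

def a3c_policy_loss(pi, taken_action, td, entropy_beta):
    # pi:           [batch, action_size] action probabilities from the policy head
    # taken_action: [batch, action_size] one-hot encoding of the executed action
    # td:           [batch] advantage estimates (R - V)
    log_pi = np.log(np.clip(pi, 1e-20, 1.0))                # avoid log(0), as above
    entropy = -np.sum(pi * log_pi, axis=1)                  # per-sample policy entropy
    log_prob_taken = np.sum(log_pi * taken_action, axis=1)  # log pi(a_t | s_t)
    # minus sign: the paper's objective is maximized, optimizers minimize
    return -np.mean(log_prob_taken * td + entropy * entropy_beta)

# purely illustrative call with a uniform policy over 4 actions
pi = np.full((2, 4), 0.25)
loss = a3c_policy_loss(pi, np.eye(4)[[0, 2]], np.array([1.0, -0.5]), entropy_beta=0.01)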
Example #2
Source File: training_thread.py    From icra2017-visual-navigation with MIT License
def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device,
               network_scope="network",
               scene_scope="scene",
               task_scope="task"):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    self.network_scope = network_scope
    self.scene_scope = scene_scope
    self.task_scope = task_scope
    self.scopes = [network_scope, scene_scope, task_scope]

    self.local_network = ActorCriticFFNetwork(
                           action_size=ACTION_SIZE,
                           device=device,
                           network_scope=network_scope,
                           scene_scopes=[scene_scope])

    self.local_network.prepare_loss(ENTROPY_BETA, self.scopes)

    self.trainer = AccumTrainer(device)
    self.trainer.prepare_minimize(self.local_network.total_loss,
                                  self.local_network.get_vars())

    self.accum_gradients = self.trainer.accumulate_gradients()
    self.reset_gradients = self.trainer.reset_gradients()

    accum_grad_names = [self._local_var_name(x) for x in self.trainer.get_accum_grad_list()]
    global_net_vars = [x for x in global_network.get_vars() if self._get_accum_grad_name(x) in accum_grad_names]

    self.apply_gradients = grad_applier.apply_gradients(
      global_net_vars, self.trainer.get_accum_grad_list() )

    self.sync = self.local_network.sync_from(global_network)

    self.env = None

    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0
    self.episode_length = 0
    self.episode_max_q = -np.inf 
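Examples #2 through #6 hand ENTROPY_BETA to a prepare_loss method rather than using it inline. The internals differ between repositories, but a TF1-style sketch of what such a method typically builds is shown below; the function name, arguments, and details are assumptions modeled on the loss in Example #1, not code from any of these projects.

import tensorflow as tf  # TF1.x graph API, matching the snippets above

def prepare_loss_sketch(pi, v, action_size, entropy_beta):
    # pi: [None, action_size] policy output; v: [None] value output
    a = tf.placeholder(tf.float32, [None, action_size])  # one-hot taken actions
    td = tf.placeholder(tf.float32, [None])               # advantage (R - V)
    r = tf.placeholder(tf.float32, [None])                # discounted return

    log_pi = tf.log(tf.clip_by_value(pi, 1e-20, 1.0))     # avoid log(0)
    entropy = -tf.reduce_sum(pi * log_pi, axis=1)         # per-sample policy entropy

    policy_loss = -tf.reduce_sum(
        tf.reduce_sum(tf.multiply(log_pi, a), axis=1) * td
        + entropy * entropy_beta)                         # entropy bonus weighted by ENTROPY_BETA
    value_loss = 0.5 * tf.nn.l2_loss(r - v)               # critic (value) loss

    total_loss = policy_loss + value_loss
    return a, td, r, total_loss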
Example #3
Source File: a3c_training_thread.py    From a3c-distributed_tensorflow with MIT License
def __init__(self,
               thread_index,
               global_network,
               pinitial_learning_rate,
               plearning_rate_input,
               pgrad_applier,
               vinitial_learning_rate,
               vlearning_rate_input,
               vgrad_applier,
               max_global_time_step,
               device, task_index=""):

    self.thread_index = thread_index
    self.plearning_rate_input = plearning_rate_input
    self.vlearning_rate_input = vlearning_rate_input
    self.max_global_time_step = max_global_time_step
    self.game_state = GameState()
    state = self.game_state.reset()
    self.game_state.reset_gs(state)
    self.action_size = self.game_state.action_size
    self.state_size = self.game_state.state_size
    self.local_max_iter = self.game_state.local_max_iter

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(self.action_size, self.state_size, self.game_state.action_low, self.game_state.action_high, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(self.action_size, self.state_size, self.game_state.action_low, self.game_state.action_high, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      pvar_refs = [v._ref() for v in self.local_network.get_pvars()]
      self.policy_gradients = tf.gradients(
        self.local_network.policy_loss, pvar_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)
      vvar_refs = [v._ref() for v in self.local_network.get_vvars()]
      self.value_gradients = tf.gradients(
        self.local_network.value_loss, vvar_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_policy_gradients = pgrad_applier.apply_gradients(
      self.local_network.get_pvars(),
      self.policy_gradients )
    self.apply_value_gradients = vgrad_applier.apply_gradients(
      self.local_network.get_vvars(),
      self.value_gradients )
    
    self.local_t = 0

    self.pinitial_learning_rate = pinitial_learning_rate
    self.vinitial_learning_rate = vinitial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0 
Example #4
Source File: a3c_training_thread.py    From a3c-distributed_tensorflow with MIT License
def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device, task_index=""):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    if global_network:
      self.apply_gradients = grad_applier.apply_gradients(
        global_network.get_vars(),
        self.gradients )
      self.sync = self.local_network.sync_from(global_network)
      self.mode = "threading"
    else:
      self.apply_gradients = grad_applier.apply_gradients(
        self.local_network.get_vars(),
        self.gradients )
      self.mode = "dist_tensor"
    if not task_index:
      self.game_state = GameState(113 * thread_index)
    else:
      self.game_state = GameState(113 * task_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0 
Example #5
Source File: a3c_training_thread.py    From async_deep_reinforce with Apache License 2.0
def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device):

    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step

    if USE_LSTM:
      self.local_network = GameACLSTMNetwork(ACTION_SIZE, thread_index, device)
    else:
      self.local_network = GameACFFNetwork(ACTION_SIZE, thread_index, device)

    self.local_network.prepare_loss(ENTROPY_BETA)

    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      global_network.get_vars(),
      self.gradients )
      
    self.sync = self.local_network.sync_from(global_network)
    
    self.game_state = GameState(113 * thread_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0 
Example #6
Source File: a3c_training_thread.py    From pathnet with MIT License
def __init__(self,
               thread_index,
               global_network,
               initial_learning_rate,
               learning_rate_input,
               grad_applier,
               max_global_time_step,
               device, FLAGS="", task_index=""):

    self.thread_index = thread_index
    self.task_index = task_index
    self.learning_rate_input = learning_rate_input
    self.max_global_time_step = max_global_time_step
    self.limit_global_time_step = 100 * 10**6
   
    if FLAGS.use_lstm:
      self.local_network = GameACPathNetLSTMNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    else:
      self.local_network = GameACPathNetNetwork(ACTION_SIZE, thread_index, device, FLAGS)
    
    self.local_network.prepare_loss(ENTROPY_BETA)
    with tf.device(device):
      var_refs = [v._ref() for v in self.local_network.get_vars()]
      self.gradients = tf.gradients(
        self.local_network.total_loss, var_refs,
        gate_gradients=False,
        aggregation_method=None,
        colocate_gradients_with_ops=False)

    self.apply_gradients = grad_applier.apply_gradients(
      self.local_network.get_vars(),
      self.gradients )

    self.game_state = GameState(113 * task_index)
    
    self.local_t = 0

    self.initial_learning_rate = initial_learning_rate

    self.episode_reward = 0

    # variable controlling log output
    self.prev_local_t = 0