Python tensorflow.python.ops.state_ops.scatter_sub() Examples
The following are 21 code examples of tensorflow.python.ops.state_ops.scatter_sub(), collected from open-source projects. The originating project, source file, and license are noted above each example.
You may also want to check out the other available functions and classes of the tensorflow.python.ops.state_ops module.
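Before the project examples, here is a minimal standalone sketch of what scatter_sub does: scatter_sub(ref, indices, updates) subtracts the given updates from the elements of ref selected by indices, in place. The variable contents and the TF 1.x-style session setup below are illustrative assumptions, not taken from any of the projects.

# Minimal sketch (illustrative, not from the examples below).
# Assumes a TensorFlow 1.x-style graph/session environment.
import tensorflow as tf
from tensorflow.python.ops import state_ops

var = tf.Variable([1.0, 2.0, 3.0, 4.0])
# Subtract 0.5 from elements 0 and 2: [1.0, 2.0, 3.0, 4.0] -> [0.5, 2.0, 2.5, 4.0]
sub_op = state_ops.scatter_sub(var, indices=[0, 2], updates=[0.5, 0.5])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(sub_op))  # [0.5 2.  2.5 4. ]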
Example #1
Source File: optimizer.py From tensorflow-XNN with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
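The _apply_sparse hook above is only reached when the optimizer receives sparse gradients, i.e. tf.IndexedSlices, which is what the gradient of an embedding lookup looks like. The sketch below of that dispatch path is an illustrative assumption (it uses the stock GradientDescentOptimizer as a stand-in for the custom optimizer), not code from tensorflow-XNN.

# Illustrative sketch (assumed, not from tensorflow-XNN): gradients of an
# embedding lookup arrive as tf.IndexedSlices, so a tf.train.Optimizer
# subclass routes them through _apply_sparse, where scatter_sub is used.
import tensorflow as tf

emb = tf.Variable(tf.random_normal([1000, 16]))   # embedding table
ids = tf.placeholder(tf.int32, [None])
loss = tf.reduce_sum(tf.nn.embedding_lookup(emb, ids))

opt = tf.train.GradientDescentOptimizer(0.1)      # stand-in optimizer
grads_and_vars = opt.compute_gradients(loss, [emb])
print(type(grads_and_vars[0][0]))                 # tf.IndexedSlices -> sparse path
train_op = opt.apply_gradients(grads_and_vars)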
Example #2
Source File: variables.py From keras-lambda with MIT License | 6 votes |
def scatter_sub(self, sparse_delta, use_locking=False):
    """Subtracts `IndexedSlices` from this variable.

    This is essentially a shortcut for
    `scatter_sub(self, sparse_delta.indices, sparse_delta.values)`.

    Args:
      sparse_delta: `IndexedSlices` to be subtracted from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the scattered subtraction has completed.

    Raises:
      ValueError: if `sparse_delta` is not an `IndexedSlices`.
    """
    if not isinstance(sparse_delta, ops.IndexedSlices):
        raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
    return state_ops.scatter_sub(
        self._variable,
        sparse_delta.indices,
        sparse_delta.values,
        use_locking=use_locking)
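A hedged usage sketch for the Variable.scatter_sub shortcut above; the shapes and values are assumptions for illustration, and TF 1.x-style reference variables are assumed.

# Illustrative use of the Variable.scatter_sub shortcut shown above.
import tensorflow as tf

x = tf.Variable(tf.zeros([10, 2]))
delta = tf.IndexedSlices(values=tf.constant([[1.0, 1.0]]),
                         indices=tf.constant([3]),
                         dense_shape=tf.constant([10, 2]))
update = x.scatter_sub(delta)          # row 3 becomes [-1.0, -1.0]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update)[3])         # [-1. -1.]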
Example #3
Source File: variables.py From Serverless-Deep-Learning-with-TensorFlow-and-AWS-Lambda with MIT License | 6 votes |
def scatter_sub(self, sparse_delta, use_locking=False):
    """Subtracts `IndexedSlices` from this variable.

    This is essentially a shortcut for
    `scatter_sub(self, sparse_delta.indices, sparse_delta.values)`.

    Args:
      sparse_delta: `IndexedSlices` to be subtracted from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the scattered subtraction has completed.

    Raises:
      ValueError: if `sparse_delta` is not an `IndexedSlices`.
    """
    if not isinstance(sparse_delta, ops.IndexedSlices):
        raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
    return state_ops.scatter_sub(
        self._variable,
        sparse_delta.indices,
        sparse_delta.values,
        use_locking=use_locking)
Example #4
Source File: optimizer.py From tensorflow-DSMM with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
Example #5
Source File: optimizer.py From tensorflow-DSMM with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    # Update 'ref' by subtracting 'value'.
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
Example #6
Source File: variables.py From deep_image_model with Apache License 2.0 | 6 votes |
def scatter_sub(self, sparse_delta, use_locking=False):
    """Subtracts `IndexedSlices` from this variable.

    This is essentially a shortcut for
    `scatter_sub(self, sparse_delta.indices, sparse_delta.values)`.

    Args:
      sparse_delta: `IndexedSlices` to be subtracted from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the scattered subtraction has completed.

    Raises:
      ValueError: if `sparse_delta` is not an `IndexedSlices`.
    """
    if not isinstance(sparse_delta, ops.IndexedSlices):
        raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
    return state_ops.scatter_sub(
        self._variable,
        sparse_delta.indices,
        sparse_delta.values,
        use_locking=use_locking)
Example #7
Source File: optimizer.py From BERT with Apache License 2.0 | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
Example #8
Source File: optimizer.py From BERT with Apache License 2.0 | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    # Update 'ref' by subtracting 'value'.
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
Example #9
Source File: variables.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def scatter_sub(self, sparse_delta, use_locking=False):
    """Subtracts `IndexedSlices` from this variable.

    This is essentially a shortcut for
    `scatter_sub(self, sparse_delta.indices, sparse_delta.values)`.

    Args:
      sparse_delta: `IndexedSlices` to be subtracted from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the scattered subtraction has completed.

    Raises:
      ValueError: if `sparse_delta` is not an `IndexedSlices`.
    """
    if not isinstance(sparse_delta, ops.IndexedSlices):
        raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
    return state_ops.scatter_sub(
        self._variable,
        sparse_delta.indices,
        sparse_delta.values,
        use_locking=use_locking)
Example #10
Source File: variables.py From lambda-packs with MIT License | 6 votes |
def scatter_sub(self, sparse_delta, use_locking=False):
    """Subtracts `IndexedSlices` from this variable.

    This is essentially a shortcut for
    `scatter_sub(self, sparse_delta.indices, sparse_delta.values)`.

    Args:
      sparse_delta: `IndexedSlices` to be subtracted from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the scattered subtraction has completed.

    Raises:
      ValueError: if `sparse_delta` is not an `IndexedSlices`.
    """
    if not isinstance(sparse_delta, ops.IndexedSlices):
        raise ValueError("sparse_delta is not IndexedSlices: %s" % sparse_delta)
    return state_ops.scatter_sub(
        self._variable,
        sparse_delta.indices,
        sparse_delta.values,
        use_locking=use_locking)
Example #11
Source File: optimizer.py From tensorflow-XNN with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps, tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)

    # Update 'ref' by subtracting 'value'.
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished.
    return control_flow_ops.group(*[var_update, m_t])
Example #12
Source File: optimizer.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
Example #13
Source File: optimizer.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #14
Source File: nadam.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #15
Source File: lazy_adam_optimizer.py From lambda-packs with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m := beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1 - beta1_t) * grad.values,
                                   use_locking=self._use_locking)

    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1 - beta2_t) * math_ops.square(grad.values),
                                   use_locking=self._use_locking)

    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
    m_t_slice = array_ops.gather(m_t, grad.indices)
    v_t_slice = array_ops.gather(v_t, grad.indices)
    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
    var_update = state_ops.scatter_sub(var, grad.indices,
                                       lr * m_t_slice / denominator_slice,
                                       use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t, v_t)
Example #16
Source File: optimizer.py From NNCF with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) / (1 - self._beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - self._beta1_t)
    m_scaled = gen_array_ops.gather(m, grad.indices) * self._beta1_t
    m_t = state_ops.scatter_update(m, grad.indices, m_scaled + m_scaled_g_values,
                                   use_locking=self._use_locking)
    m_tp = gen_array_ops.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad.values * grad.values) * (1 - self._beta2_t)
    v_scaled = gen_array_ops.gather(v, grad.indices) * self._beta2_t
    v_t = state_ops.scatter_update(v, grad.indices, v_scaled + v_scaled_g_values,
                                   use_locking=self._use_locking)
    v_tp = gen_array_ops.gather(v_t, grad.indices)
    v_sqrtp = math_ops.sqrt(v_tp)

    var_update = state_ops.scatter_sub(var, grad.indices,
                                       lr * m_tp / (v_sqrtp + self._epsilon_t),
                                       use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #17
Source File: optimizer.py From tensorflow-XNN with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #18
Source File: optimizer.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
Example #19
Source File: optimizer.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #20
Source File: nadam.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power * self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = (1. - momentum_cache_t) * g_prime_slice + momentum_cache_t_1 * m_t_prime_slice

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #21
Source File: optimizer.py From tensorflow-XNN with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])