Python tensorflow.python.ops.state_ops.scatter_update() Examples
The following are 28 code examples of tensorflow.python.ops.state_ops.scatter_update(), collected from open-source projects. The original project and source file for each example are noted above it.
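Before the examples, here is a minimal sketch of what state_ops.scatter_update does: it overwrites the rows of a mutable variable selected by indices with the given update values and returns the updated tensor. The snippet assumes a TF 1.x-style graph session (via tf.compat.v1 on newer installs); the variable contents are illustrative only.

import tensorflow.compat.v1 as tf
from tensorflow.python.ops import state_ops

tf.disable_eager_execution()

# A 4x2 variable; rows 0 and 2 will be overwritten in place.
params = tf.Variable([[1., 1.], [2., 2.], [3., 3.], [4., 4.]])
update_op = state_ops.scatter_update(params, [0, 2], [[10., 10.], [30., 30.]])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(update_op))  # rows 0 and 2 now hold the new values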
Example #1
Source File: optimizer.py From tensorflow-XNN with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
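As a quick sanity check of the rule implemented above (the rest of the optimizer class is not shown here), the dense NumPy sketch below mirrors the same two steps on a toy slice: the "m" slot is set to max(beta * m + eps, |g|), and the variable is decreased by lr * g * (1 + alpha * sign(g) * sign(m_t)). All numbers are made up for illustration.

import numpy as np

lr, beta, alpha, eps = 0.01, 0.9, 0.1, 1e-7
g = np.array([0.5, -0.2])    # gradient values for the gathered indices
m = np.array([0.3, 0.1])     # current moving-average slot values
var = np.array([1.0, 1.0])   # current variable slice

m_t = np.maximum(beta * m + eps, np.abs(g))                  # the scatter_update on "m"
var -= lr * g * (1.0 + alpha * np.sign(g) * np.sign(m_t))    # the scatter_sub on "var"
print(m_t, var)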
Example #2
Source File: topn.py From keras-lambda with MIT License | 6 votes |
def remove(self, ids):
    """Remove the ids (and their associated scores) from the TopN."""
    with ops.control_dependencies(self.last_ops):
        scatter_op = state_ops.scatter_update(
            self.id_to_score, ids,
            array_ops.ones_like(ids, dtype=dtypes.float32) * dtypes.float32.min)
        # We assume that removed ids are almost always in the shortlist,
        # so it makes no sense to hide the Op behind a tf.cond
        shortlist_ids_to_remove, new_length = tensor_forest_ops.top_n_remove(
            self.sl_ids, ids)
        u1 = state_ops.scatter_update(
            self.sl_ids,
            array_ops.concat([[0], shortlist_ids_to_remove], 0),
            array_ops.concat(
                [new_length, array_ops.ones_like(shortlist_ids_to_remove) * -1], 0))
        u2 = state_ops.scatter_update(
            self.sl_scores, shortlist_ids_to_remove,
            dtypes.float32.min * array_ops.ones_like(
                shortlist_ids_to_remove, dtype=dtypes.float32))
        self.last_ops = [scatter_op, u1, u2]
Example #3
Source File: topn.py From keras-lambda with MIT License | 6 votes |
def insert(self, ids, scores):
    """Insert the ids and scores into the TopN."""
    with ops.control_dependencies(self.last_ops):
        scatter_op = state_ops.scatter_update(self.id_to_score, ids, scores)
        larger_scores = math_ops.greater(scores, self.sl_scores[0])

        def shortlist_insert():
            larger_ids = array_ops.boolean_mask(
                math_ops.to_int64(ids), larger_scores)
            larger_score_values = array_ops.boolean_mask(scores, larger_scores)
            shortlist_ids, new_ids, new_scores = tensor_forest_ops.top_n_insert(
                self.sl_ids, self.sl_scores, larger_ids, larger_score_values)
            u1 = state_ops.scatter_update(self.sl_ids, shortlist_ids, new_ids)
            u2 = state_ops.scatter_update(self.sl_scores, shortlist_ids, new_scores)
            return control_flow_ops.group(u1, u2)

        # We only need to insert into the shortlist if there are any
        # scores larger than the threshold.
        cond_op = control_flow_ops.cond(
            math_ops.reduce_any(larger_scores), shortlist_insert,
            control_flow_ops.no_op)
        with ops.control_dependencies([cond_op]):
            self.last_ops = [scatter_op, cond_op]
Example #4
Source File: factorization_ops.py From keras-lambda with MIT License | 6 votes |
def scatter_update(cls, factor, indices, values, sharding_func):
    """Helper function for doing sharded scatter update."""
    assert isinstance(factor, list)
    if len(factor) == 1:
        with ops.colocate_with(factor[0]):
            # TODO(agarwal): assign instead of scatter update for full batch update.
            return state_ops.scatter_update(factor[0], indices, values).op
    else:
        num_shards = len(factor)
        assignments, new_ids = sharding_func(indices)
        assert assignments is not None
        assignments = math_ops.cast(assignments, dtypes.int32)
        sharded_ids = data_flow_ops.dynamic_partition(new_ids, assignments,
                                                      num_shards)
        sharded_values = data_flow_ops.dynamic_partition(values, assignments,
                                                         num_shards)
        updates = []
        for i in xrange(num_shards):
            updates.append(
                state_ops.scatter_update(factor[i], sharded_ids[i],
                                         sharded_values[i]))
        return control_flow_ops.group(*updates)
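The helper above leaves sharding_func to the caller; from the code it only needs to map global row indices to a pair (shard assignments, within-shard row ids). A hypothetical modulo-sharding function with that contract might look like the sketch below (mod_sharding_func is an illustrative name, not part of factorization_ops).

from tensorflow.python.ops import math_ops

def mod_sharding_func(num_shards):
    """Builds a sharding function that places global row i on shard i % num_shards."""
    def sharding(indices):
        assignments = math_ops.mod(indices, num_shards)   # which shard each row lives on
        new_ids = math_ops.floordiv(indices, num_shards)  # row offset within that shard
        return assignments, new_ids
    return sharding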
Example #5
Source File: optimizer.py From tensorflow-DSMM with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
Example #6
Source File: optimizer.py From tensorflow-DSMM with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    # Update 'ref' by subtracting 'value'
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(
            tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
Example #7
Source File: optimizer.py From BERT with Apache License 2.0 | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * (1.0 + alpha_t * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
Example #8
Source File: optimizer.py From BERT with Apache License 2.0 | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    # Update 'ref' by subtracting 'value'
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(
            tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
Example #9
Source File: topn.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def remove(self, ids):
    """Remove the ids (and their associated scores) from the TopN."""
    with ops.control_dependencies(self.last_ops):
        scatter_op = state_ops.scatter_update(
            self.id_to_score, ids,
            array_ops.ones_like(ids, dtype=dtypes.float32) * dtypes.float32.min)
        # We assume that removed ids are almost always in the shortlist,
        # so it makes no sense to hide the Op behind a tf.cond
        shortlist_ids_to_remove, new_length = tensor_forest_ops.top_n_remove(
            self.sl_ids, ids)
        u1 = state_ops.scatter_update(
            self.sl_ids,
            array_ops.concat([[0], shortlist_ids_to_remove], 0),
            array_ops.concat(
                [new_length, array_ops.ones_like(shortlist_ids_to_remove) * -1], 0))
        u2 = state_ops.scatter_update(
            self.sl_scores, shortlist_ids_to_remove,
            dtypes.float32.min * array_ops.ones_like(
                shortlist_ids_to_remove, dtype=dtypes.float32))
        self.last_ops = [scatter_op, u1, u2]
Example #10
Source File: optimizer.py From tensorflow-XNN with MIT License | 6 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    eps = 1e-7  # cap for moving average

    m = self.get_slot(var, "m")
    m_slice = tf.gather(m, grad.indices)
    m_t = state_ops.scatter_update(m, grad.indices,
                                   tf.maximum(beta_t * m_slice + eps,
                                              tf.abs(grad.values)))
    m_t_slice = tf.gather(m_t, grad.indices)
    # Update 'ref' by subtracting 'value'
    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * grad.values * tf.exp(
            tf.log(alpha_t) * tf.sign(grad.values) * tf.sign(m_t_slice)))

    # Create an op that groups multiple operations.
    # When this op finishes, all ops in input have finished
    return control_flow_ops.group(*[var_update, m_t])
Example #11
Source File: factorization_ops.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def scatter_update(cls, factor, indices, values, sharding_func):
    """Helper function for doing sharded scatter update."""
    assert isinstance(factor, list)
    if len(factor) == 1:
        with ops.colocate_with(factor[0]):
            # TODO(agarwal): assign instead of scatter update for full batch update.
            return state_ops.scatter_update(factor[0], indices, values).op
    else:
        num_shards = len(factor)
        assignments, new_ids = sharding_func(indices)
        assert assignments is not None
        assignments = math_ops.cast(assignments, dtypes.int32)
        sharded_ids = data_flow_ops.dynamic_partition(new_ids, assignments,
                                                      num_shards)
        sharded_values = data_flow_ops.dynamic_partition(values, assignments,
                                                         num_shards)
        updates = []
        for i in xrange(num_shards):
            updates.append(
                state_ops.scatter_update(factor[i], sharded_ids[i],
                                         sharded_values[i]))
        return control_flow_ops.group(*updates)
Example #12
Source File: topn.py From auto-alt-text-lambda-api with MIT License | 6 votes |
def insert(self, ids, scores):
    """Insert the ids and scores into the TopN."""
    with ops.control_dependencies(self.last_ops):
        scatter_op = state_ops.scatter_update(self.id_to_score, ids, scores)
        larger_scores = math_ops.greater(scores, self.sl_scores[0])

        def shortlist_insert():
            larger_ids = array_ops.boolean_mask(
                math_ops.to_int64(ids), larger_scores)
            larger_score_values = array_ops.boolean_mask(scores, larger_scores)
            shortlist_ids, new_ids, new_scores = tensor_forest_ops.top_n_insert(
                self.sl_ids, self.sl_scores, larger_ids, larger_score_values)
            u1 = state_ops.scatter_update(self.sl_ids, shortlist_ids, new_ids)
            u2 = state_ops.scatter_update(self.sl_scores, shortlist_ids, new_scores)
            return control_flow_ops.group(u1, u2)

        # We only need to insert into the shortlist if there are any
        # scores larger than the threshold.
        cond_op = control_flow_ops.cond(
            math_ops.reduce_any(larger_scores), shortlist_insert,
            control_flow_ops.no_op)
        with ops.control_dependencies([cond_op]):
            self.last_ops = [scatter_op, cond_op]
Example #13
Source File: factorization_ops.py From lambda-packs with MIT License | 6 votes |
def scatter_update(cls, factor, indices, values, sharding_func, name=None):
    """Helper function for doing sharded scatter update."""
    assert isinstance(factor, list)
    if len(factor) == 1:
        with ops.colocate_with(factor[0]):
            # TODO(agarwal): assign instead of scatter update for full batch update.
            return state_ops.scatter_update(
                factor[0], indices, values, name=name).op
    else:
        num_shards = len(factor)
        assignments, new_ids = sharding_func(indices)
        assert assignments is not None
        assignments = math_ops.cast(assignments, dtypes.int32)
        sharded_ids = data_flow_ops.dynamic_partition(new_ids, assignments,
                                                      num_shards)
        sharded_values = data_flow_ops.dynamic_partition(values, assignments,
                                                         num_shards)
        updates = []
        for i in xrange(num_shards):
            updates.append(
                state_ops.scatter_update(factor[i], sharded_ids[i],
                                         sharded_values[i]))
        return control_flow_ops.group(*updates, name=name)
Example #14
Source File: tensor_forest.py From lambda-packs with MIT License | 5 votes |
def tree_initialization(self):
    def _init_tree():
        return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op

    def _nothing():
        return control_flow_ops.no_op()

    return control_flow_ops.cond(
        math_ops.equal(
            array_ops.squeeze(
                array_ops.strided_slice(self.variables.tree, [0, 0], [1, 1])),
            -2),
        _init_tree, _nothing)
Example #15
Source File: optimizer.py From tensorflow-XNN with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
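For reference, and assuming only what the code above shows, the per-index update matches the AMSGrad variant of Adam, with the "v_prime" slot holding the running maximum of the second moment:

m_t = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
\hat{v}_t = \max(\hat{v}_{t-1}, v_t)
\theta_t = \theta_{t-1} - \eta\, m_t / (\sqrt{\hat{v}_t} + \epsilon)

Here g_t is the sparse gradient slice (grad.values), \eta is lr_t, and each line is applied only at grad.indices via scatter_update / scatter_sub.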
Example #16
Source File: tensor_forest.py From keras-lambda with MIT License | 5 votes |
def tree_initialization(self):
    def _init_tree():
        return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op

    def _nothing():
        return control_flow_ops.no_op()

    return control_flow_ops.cond(
        math_ops.equal(
            array_ops.squeeze(
                array_ops.strided_slice(self.variables.tree, [0, 0], [1, 1])),
            -2),
        _init_tree, _nothing)
Example #17
Source File: optimizer.py From tensorflow-XNN with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power *
                                    self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = ((1. - momentum_cache_t) * g_prime_slice +
                     momentum_cache_t_1 * m_t_prime_slice)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #18
Source File: tensor_forest.py From deep-learning with MIT License | 5 votes |
def tree_initialization(self):
    def _init_tree():
        return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op

    def _nothing():
        return control_flow_ops.no_op()

    return control_flow_ops.cond(
        math_ops.equal(
            array_ops.squeeze(
                array_ops.slice(self.variables.tree, [0, 0], [1, 1])),
            -2),
        _init_tree, _nothing)
Example #19
Source File: nadam.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power *
                                    self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = ((1. - momentum_cache_t) * g_prime_slice +
                     momentum_cache_t_1 * m_t_prime_slice)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #20
Source File: optimizer.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power *
                                    self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = ((1. - momentum_cache_t) * g_prime_slice +
                     momentum_cache_t_1 * m_t_prime_slice)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #21
Source File: optimizer.py From tensorflow-DSMM with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
Example #22
Source File: optimizer.py From NNCF with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    lr = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) /
          (1 - self._beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - self._beta1_t)
    m_scaled = gen_array_ops.gather(m, grad.indices) * self._beta1_t
    m_t = state_ops.scatter_update(m, grad.indices,
                                   m_scaled + m_scaled_g_values,
                                   use_locking=self._use_locking)
    m_tp = gen_array_ops.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad.values * grad.values) * (1 - self._beta2_t)
    v_scaled = gen_array_ops.gather(v, grad.indices) * self._beta2_t
    v_t = state_ops.scatter_update(v, grad.indices,
                                   v_scaled + v_scaled_g_values,
                                   use_locking=self._use_locking)
    v_tp = gen_array_ops.gather(v_t, grad.indices)
    v_sqrtp = math_ops.sqrt(v_tp)

    var_update = state_ops.scatter_sub(var, grad.indices,
                                       lr * m_tp / (v_sqrtp + self._epsilon_t),
                                       use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #23
Source File: tensor_forest.py From deep_image_model with Apache License 2.0 | 5 votes |
def tree_initialization(self):
    def _init_tree():
        return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op

    def _nothing():
        return control_flow_ops.no_op()

    return control_flow_ops.cond(
        math_ops.equal(
            array_ops.squeeze(
                array_ops.slice(self.variables.tree, [0, 0], [1, 1])),
            -2),
        _init_tree, _nothing)
Example #24
Source File: nadam.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power *
                                    self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = ((1. - momentum_cache_t) * g_prime_slice +
                     momentum_cache_t_1 * m_t_prime_slice)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #25
Source File: optimizer.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    t = math_ops.cast(self._iterations, var.dtype.base_dtype) + 1.
    m_schedule = math_ops.cast(self._m_schedule, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    schedule_decay_t = math_ops.cast(self._schedule_decay_t, var.dtype.base_dtype)

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_power = self._get_momentum_cache(schedule_decay_t, t)
    momentum_cache_t = beta1_t * (1. - 0.5 * momentum_cache_power)
    momentum_cache_t_1 = beta1_t * (1. - 0.5 * momentum_cache_power *
                                    self._momentum_cache_const)
    m_schedule_new = m_schedule * momentum_cache_t
    m_schedule_next = m_schedule_new * momentum_cache_t_1

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    g_prime_slice = grad.values / (1. - m_schedule_new)
    m_t_prime_slice = array_ops.gather(m_t, grad.indices) / (1. - m_schedule_next)
    m_t_bar_slice = ((1. - momentum_cache_t) * g_prime_slice +
                     momentum_cache_t_1 * m_t_prime_slice)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_t_prime_slice = array_ops.gather(v_t, grad.indices) / (1. - tf.pow(beta2_t, t))

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_bar_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #26
Source File: optimizer.py From BERT with Apache License 2.0 | 5 votes |
def _apply_sparse(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

    # the following equations given in [1]
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1. - beta1_t) * grad.values,
                                   use_locking=self._use_locking)
    m_t_slice = tf.gather(m_t, grad.indices)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1. - beta2_t) * tf.square(grad.values),
                                   use_locking=self._use_locking)
    v_prime = self.get_slot(var, "v_prime")
    v_t_slice = tf.gather(v_t, grad.indices)
    v_prime_slice = tf.gather(v_prime, grad.indices)
    v_t_prime = state_ops.scatter_update(v_prime, grad.indices,
                                         tf.maximum(v_prime_slice, v_t_slice))
    v_t_prime_slice = array_ops.gather(v_t_prime, grad.indices)

    var_update = state_ops.scatter_sub(
        var, grad.indices,
        lr_t * m_t_slice / (math_ops.sqrt(v_t_prime_slice) + epsilon_t),
        use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
Example #27
Source File: lazy_adam_optimizer.py From lambda-packs with MIT License | 5 votes |
def _apply_sparse(self, grad, var):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

    # m := beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_t = state_ops.scatter_update(m, grad.indices,
                                   beta1_t * array_ops.gather(m, grad.indices) +
                                   (1 - beta1_t) * grad.values,
                                   use_locking=self._use_locking)

    # v := beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_t = state_ops.scatter_update(v, grad.indices,
                                   beta2_t * array_ops.gather(v, grad.indices) +
                                   (1 - beta2_t) * math_ops.square(grad.values),
                                   use_locking=self._use_locking)

    # variable -= learning_rate * m_t / (epsilon_t + sqrt(v_t))
    m_t_slice = array_ops.gather(m_t, grad.indices)
    v_t_slice = array_ops.gather(v_t, grad.indices)
    denominator_slice = math_ops.sqrt(v_t_slice) + epsilon_t
    var_update = state_ops.scatter_sub(var, grad.indices,
                                       lr * m_t_slice / denominator_slice,
                                       use_locking=self._use_locking)
    return control_flow_ops.group(var_update, m_t, v_t)
Example #28
Source File: tensor_forest.py From auto-alt-text-lambda-api with MIT License | 5 votes |
def tree_initialization(self):
    def _init_tree():
        return state_ops.scatter_update(self.variables.tree, [0], [[-1, -1]]).op

    def _nothing():
        return control_flow_ops.no_op()

    return control_flow_ops.cond(
        math_ops.equal(
            array_ops.squeeze(
                array_ops.strided_slice(self.variables.tree, [0, 0], [1, 1])),
            -2),
        _init_tree, _nothing)