Python keras.backend.update_add() Examples
The following are 30 code examples of keras.backend.update_add(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module keras.backend, or try the search function.
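
Before the examples, a quick orientation: K.update_add(x, increment) returns a backend op that adds increment to the variable x in place, and optimizers typically collect it in self.updates to advance self.iterations once per batch. Below is a minimal sketch of the call, assuming the multi-backend Keras with the TensorFlow 1.x graph backend that these projects target (the variable name is only illustrative):

import keras.backend as K

# A backend variable holding a running counter, and an op that adds 1 to it.
counter = K.variable(0., name='counter')
increment_op = K.update_add(counter, 1.)

# Evaluating the op applies the in-place addition; each evaluation adds 1.
K.eval(increment_op)
K.eval(increment_op)
print(K.get_value(counter))  # -> 2.0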
Example #1
Source File: optimizers.py From keras-lookahead with MIT License | 6 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = self.learning_rate * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        self.updates.append(K.update_sub(p, p_t))
    return self.updates
Example #2
Source File: metrics.py From keras-metrics with MIT License | 6 votes |
def __call__(self, y_true, y_pred):
    y_true = K.cast(K.round(y_true), "int32")
    y_pred = K.cast(K.round(y_pred), "int32")
    neg_y_pred = 1 - y_pred

    tp = K.sum(K.transpose(y_true * y_pred), axis=-1)
    fn = K.sum(K.transpose(y_true * neg_y_pred), axis=-1)

    current_tp = K.cast(self.tp + tp, self.epsilon.dtype)
    current_fn = K.cast(self.fn + fn, self.epsilon.dtype)

    tp_update = K.update_add(self.tp, tp)
    fn_update = K.update_add(self.fn, fn)

    self.add_update(tp_update, inputs=[y_true, y_pred])
    self.add_update(fn_update, inputs=[y_true, y_pred])

    return K.mean(truediv(current_tp, current_tp + current_fn + self.epsilon))
Example #3
Source File: metrics.py From keras-metrics with MIT License | 5 votes |
def __call__(self, y_true, y_pred):
    y_true, y_pred = self.cast(y_true, y_pred)
    neg_y_pred = 1 - y_pred

    fn = K.sum(y_true * neg_y_pred)
    current_fn = self.fn * 1

    fn_update = K.update_add(self.fn, fn)
    self.add_update(fn_update, inputs=[y_true, y_pred])

    return fn + current_fn
Example #4
Source File: LR_SGD.py From tripletloss-keras-tensorflow with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        matched_layer = [x for x in self.lr_multipliers.keys() if x in p.name]
        if matched_layer:
            new_lr = lr * self.lr_multipliers[matched_layer[0]]
        else:
            new_lr = lr

        v = self.momentum * m - new_lr * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p + self.momentum * v - new_lr * g
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #5
Source File: model.py From cvpr-2018-autonomous-driving-autopilot-solution with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    accum_switch = K.equal(self.iterations % self.accum_iters, 0)
    print(accum_switch)
    accum_switch = K.cast(accum_switch, dtype='float32')

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    temp_grads = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, cg, m, tg in zip(params, grads, moments, temp_grads):
        g = cg + tg
        v = self.momentum * m - (lr * g / self.accum_iters)  # velocity
        self.updates.append(K.update(m, (1 - accum_switch) * m + accum_switch * v))
        self.updates.append(K.update(tg, (1 - accum_switch) * g))

        if self.nesterov:
            new_p = p + self.momentum * v - (lr * g / self.accum_iters)
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, (1 - accum_switch) * p + accum_switch * new_p))
    return self.updates
Example #6
Source File: model_inceptionresnet.py From cvpr-2018-autonomous-driving-autopilot-solution with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    accum_switch = K.equal(self.iterations % self.accum_iters, 0)
    print(accum_switch)
    accum_switch = K.cast(accum_switch, dtype='float32')

    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    temp_grads = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, cg, m, tg in zip(params, grads, moments, temp_grads):
        g = cg + tg
        v = self.momentum * m - (lr * g / self.accum_iters)  # velocity
        self.updates.append(K.update(m, (1 - accum_switch) * m + accum_switch * v))
        self.updates.append(K.update(tg, (1 - accum_switch) * g))

        if self.nesterov:
            new_p = p + self.momentum * v - (lr * g / self.accum_iters)
        else:
            new_p = p + v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, (1 - accum_switch) * p + accum_switch * new_p))
    return self.updates
Example #7
Source File: decaying_dropout.py From DIIN-in-Keras with MIT License | 5 votes |
def call(self, inputs, training=None):
    noise_shape = self._get_noise_shape(inputs)
    t = K.cast(self.iterations, K.floatx()) + 1
    p = t / float(self.decay_interval)
    keep_rate = self.initial_keep_rate * K.pow(self.decay_rate, p)

    def dropped_inputs():
        self.add_update([K.update_add(self.iterations, [1])], inputs)
        return K.dropout(inputs, 1 - keep_rate[0], noise_shape,
                         seed=self.seed)

    return K.in_train_phase(dropped_inputs, inputs, training=training)
Example #8
Source File: wide_residual_network.py From AnomalyDetectionTransformations with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        v = self.momentum * m + g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            new_p = p - lr * (self.momentum * v + g)
        else:
            new_p = p - lr * v

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #9
Source File: adamw.py From EAST with GNU General Public License v3.0 | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    wd = self.wd  # decoupled weight decay (3/4)

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p  # decoupled weight decay (4/4)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #10
Source File: optimizers.py From keras_experiments with The Unlicense | 5 votes |
def get_updates(self, loss, params):
    tower_gradvars = []
    gdev_list = self._gdev_list

    global_scope = tf.get_variable_scope()
    for idev, device in enumerate(gdev_list):
        with tf.device(device), \
                tf.variable_scope(global_scope, reuse=idev > 0), \
                tf.name_scope('tower_%i' % idev):
            grads = self.optimizer.compute_gradients(loss, params)

        gradvars = zip(grads, params)
        tower_gradvars.append(gradvars)

    tower_gradvars = all_avg_gradients(tower_gradvars, gdev_list,
                                       usenccl=False)

    self.updates = [K.update_add(self.iterations, 1)]

    for device_num, device in enumerate(gdev_list):
        with tf.device(device):
            gradvars = tower_gradvars[device_num]
            opt_update = self.optimizer.apply_gradients(
                grads, global_step=self.iterations)
            self.updates.append(opt_update)

    return self.updates
Example #11
Source File: lang_model_sgd.py From tying-wv-and-wc with MIT License | 5 votes |
def get_updates(self, params, constraints, loss):
    grads = self.get_gradients(loss, params)
    self.updates = []
    self.updates.append(K.update_add(self.iterations, 1))

    for p, g in zip(params, grads):
        self.updates.append((p, p - self.lr * g))

    return self.updates
Example #12
Source File: metrics.py From keras-metrics with MIT License | 5 votes |
def __call__(self, y_true, y_pred):
    y_true, y_pred = self.cast(y_true, y_pred)
    neg_y_true = 1 - y_true

    fp = K.sum(neg_y_true * y_pred)
    current_fp = self.fp * 1

    fp_update = K.update_add(self.fp, fp)
    self.add_update(fp_update, inputs=[y_true, y_pred])

    return fp + current_fp
Example #13
Source File: metrics.py From keras-metrics with MIT License | 5 votes |
def __call__(self, y_true, y_pred):
    y_true, y_pred = self.cast(y_true, y_pred)
    neg_y_true = 1 - y_true
    neg_y_pred = 1 - y_pred

    tn = K.sum(neg_y_true * neg_y_pred)
    current_tn = self.tn * 1

    tn_update = K.update_add(self.tn, tn)
    self.add_update(tn_update, inputs=[y_true, y_pred])

    return tn + current_tn
Example #14
Source File: metrics.py From keras-metrics with MIT License | 5 votes |
def __call__(self, y_true, y_pred):
    y_true, y_pred = self.cast(y_true, y_pred)

    tp = K.sum(y_true * y_pred)
    current_tp = self.tp * 1

    tp_update = K.update_add(self.tp, tp)
    self.add_update(tp_update, inputs=[y_true, y_pred])

    return tp + current_tp
Example #15
Source File: yogi.py From keras-contrib with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        g2 = K.square(g)
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = v - (1. - self.beta_2) * K.sign(v - g2) * g2
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #16
Source File: ftml.py From keras-contrib with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.inital_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr / (1. - K.pow(self.beta_1, t))

    shapes = [K.int_shape(p) for p in params]
    zs = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    ds = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + zs + vs + ds

    for p, g, z, v, d in zip(params, grads, zs, vs, ds):
        v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
        d_t = (K.sqrt(v_t / (1. - K.pow(self.beta_2, t))) + self.epsilon) / lr_t
        sigma_t = d_t - self.beta_1 * d
        z_t = self.beta_1 * z + (1. - self.beta_1) * g - sigma_t * p

        p_t = - z_t / d_t

        self.updates.append(K.update(z, z_t))
        self.updates.append(K.update(v, v_t))
        self.updates.append(K.update(d, d_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #17
Source File: lars.py From keras-contrib with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    weights = self.get_weights()
    self.updates = [K.update_add(self.iterations, 1)]
    scaled_lr = self.lr

    w_norm = K.sqrt(K.sum([K.sum(K.square(weight)) for weight in weights]))
    g_norm = K.sqrt(K.sum([K.sum(K.square(grad)) for grad in grads]))
    scaled_lr = K.switch(K.greater(w_norm * g_norm, K.zeros([1])),
                         K.expand_dims((self.eeta * w_norm /
                                        (g_norm + self.weight_decay * w_norm +
                                         self.epsilon)) * self.lr),
                         K.ones([1]) * self.lr)
    if K.backend() == 'theano':
        scaled_lr = scaled_lr[0]  # otherwise theano raise broadcasting error

    # momentum
    moments = [K.zeros(K.int_shape(param), dtype=K.dtype(param))
               for param in params]
    self.weights = [self.iterations] + moments

    for param, grad, moment in zip(params, grads, moments):
        v0 = (moment * self.momentum)
        v1 = scaled_lr * grad  # velocity
        veloc = v0 - v1
        self.updates.append(K.update(moment, veloc))

        if self.nesterov:
            new_param = param + (veloc * self.momentum) - v1
        else:
            new_param = param + veloc

        # Apply constraints.
        if getattr(param, 'constraint', None) is not None:
            new_param = param.constraint(new_param)

        self.updates.append(K.update(param, new_param))
    return self.updates
Example #18
Source File: keras_radam.py From Keras-TextClassification with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    beta_1_t = K.pow(self.beta_1, t)
    beta_2_t = K.pow(self.beta_2, t)
    rho = 2 / (1 - self.beta_2) - 1
    rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t)
    r_t = K.sqrt(
        K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho / ((rho - 4) * (rho - 2) * rho_t)
    )
    flag = K.cast(rho_t > 4, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        mhat_t = m_t / (1 - beta_1_t)
        vhat_t = K.sqrt(v_t / (1 - beta_2_t))

        p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) + (1 - flag))

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #19
Source File: models.py From DigiX_HuaWei_Population_Age_Attribution_Predict with MIT License | 5 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        vhat_t = K.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        self.updates.append(K.update(vhat, vhat_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #20
Source File: optimizers.py From keras-adamw with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.learning_rate
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape, name='moment_' + str(i))
               for (i, shape) in enumerate(shapes)]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        # Learning rate multipliers
        lr_t = self.learning_rate
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        v = self.momentum * m - self.eta_t * lr_t * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            p_t = p + self.momentum * v - self.eta_t * lr_t * g
        else:
            p_t = p + v

        # Weight decays
        if p.name in self.weight_decays.keys():
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    # Cosine annealing
    _update_t_cur_eta_t(self)
    self.lr_t = lr_t * self.eta_t  # for external tracking

    self._init_notified = True
    return self.updates
Example #21
Source File: optimizers_225.py From keras-adamw with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    lr_t_premult = lr_t
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t_premult, p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - self.eta_t * lr_t * m_t / (
                K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - self.eta_t * lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        # Weight decays
        if p.name in self.weight_decays.keys():
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    # Cosine annealing
    _update_t_cur_eta_t(self)
    self.lr_t = lr_t * self.eta_t  # for external tracking

    self._init_notified = True
    return self.updates
Example #22
Source File: optimizers_225.py From keras-adamw with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    t = K.cast(self.iterations, K.floatx()) + 1

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (1. - 0.5 * (
        K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (1. - 0.5 * (
        K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    shapes = [K.int_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # Learning rate multipliers
        lr_t = self.lr
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        # the following equations given in [1]
        g_prime = g / (1. - m_schedule_new)
        m_t = self.beta_1 * m + (1. - self.beta_1) * g
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
        v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
        m_t_bar = (1. - momentum_cache_t) * g_prime + (
            momentum_cache_t_1 * m_t_prime)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        p_t = p - self.eta_t * lr_t * m_t_bar / (
            K.sqrt(v_t_prime) + self.epsilon)

        # Weight decays
        if p.name in self.weight_decays.keys():
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    # Cosine annealing
    _update_t_cur_eta_t(self)
    self.lr_t = lr_t * self.eta_t  # for external tracking

    self._init_notified = True
    return self.updates
Example #23
Source File: optimizers_225.py From keras-adamw with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))
    # momentum
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments

    for p, g, m in zip(params, grads, moments):
        # Learning rate multipliers
        lr_t = self.lr
        if self.lr_multipliers is not None:
            lr_t = _apply_lr_multiplier(self, lr_t, p)

        v = self.momentum * m - self.eta_t * lr_t * g  # velocity
        self.updates.append(K.update(m, v))

        if self.nesterov:
            p_t = p + self.momentum * v - self.eta_t * lr_t * g
        else:
            p_t = p + v

        # Weight decays
        if p.name in self.weight_decays.keys():
            p_t = _apply_weight_decays(self, p, p_t)
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))

    # Cosine annealing
    _update_t_cur_eta_t(self)
    self.lr_t = lr_t * self.eta_t  # for external tracking

    self._init_notified = True
    return self.updates
Example #24
Source File: Eve.py From DeepLearningImplementations with MIT License | 4 votes |
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.inital_decay > 0:
        lr *= (1. / (1. + self.decay * self.iterations))

    t = self.iterations + 1
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    f = K.variable(0)
    d = K.variable(1)
    self.weights = [self.iterations] + ms + vs + [f, d]

    cond = K.greater(t, K.variable(1))
    small_delta_t = K.switch(K.greater(loss, f), self.small_k + 1, 1. / (self.big_K + 1))
    big_delta_t = K.switch(K.greater(loss, f), self.big_K + 1, 1. / (self.small_k + 1))

    c_t = K.minimum(K.maximum(small_delta_t, loss / (f + self.epsilon)), big_delta_t)
    f_t = c_t * f
    r_t = K.abs(f_t - f) / (K.minimum(f_t, f))
    d_t = self.beta_3 * d + (1 - self.beta_3) * r_t

    f_t = K.switch(cond, f_t, loss)
    d_t = K.switch(cond, d_t, K.variable(1.))

    self.updates.append(K.update(f, f_t))
    self.updates.append(K.update(d, d_t))

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (d_t * K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))

        new_p = p_t
        self.updates.append(K.update(p, new_p))
    return self.updates
Example #25
Source File: keras_lamb.py From keras-LAMB-Optimizer with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        m_t_hat = m_t / (1. - K.pow(self.beta_1, t))
        v_t_hat = v_t / (1. - K.pow(self.beta_2, t))

        p_dash = m_t_hat / (K.sqrt(v_t_hat + self.epsilon))

        if self.weight_decay > 0.:
            wd = self.weight_decay * p
            p_dash = p_dash + wd

        r1 = K.sqrt(K.sum(K.square(p)))
        r2 = K.sqrt(K.sum(K.square(p_dash)))

        r = tf.where(tf.greater(r1, 0.),
                     tf.where(tf.greater(r2, 0.),
                              r1 / r2,
                              1.0),
                     1.0)
        # r = r1 / r2
        eta = r * lr

        p_t = p - eta * p_dash

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #26
Source File: AdamAccumulate.py From Coloring-greyscale-images with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    completed_updates = K.cast(K.tf.floor(self.iterations / self.accum_iters), K.floatx())

    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * completed_updates))

    t = completed_updates + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))

    # self.iterations incremented after processing a batch
    # batch:              1 2 3 4 5 6 7 8 9
    # self.iterations:    0 1 2 3 4 5 6 7 8
    # update_switch = 1:        x       x    (if accum_iters=4)
    update_switch = K.equal((self.iterations + 1) % self.accum_iters, 0)
    update_switch = K.cast(update_switch, K.floatx())

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    gs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]

    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat, tg in zip(params, grads, ms, vs, vhats, gs):
        sum_grad = tg + g
        avg_grad = sum_grad / self.accum_iters_float

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * avg_grad
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(avg_grad)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, (1 - update_switch) * vhat + update_switch * vhat_t))
        else:
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, (1 - update_switch) * m + update_switch * m_t))
        self.updates.append(K.update(v, (1 - update_switch) * v + update_switch * v_t))
        self.updates.append(K.update(tg, (1 - update_switch) * sum_grad))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, (1 - update_switch) * p + update_switch * new_p))
    return self.updates
Example #27
Source File: optimizer.py From Anime-Super-Resolution with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations, K.floatx())))
        pass

    t = K.cast(self.iterations + 1, K.floatx())
    lr_t = lr * K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t))

    shapes = [K.get_variable_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        # if a weight tensor (len > 1) use weight normalized parameterization
        # this is the only part changed w.r.t. keras.optimizers.Adam
        ps = K.get_variable_shape(p)
        if len(ps) > 1:
            # get weight normalization parameters
            V, V_norm, V_scaler, g_param, grad_g, grad_V = get_weightnorm_params_and_grads(p, g)

            # Adam containers for the 'g' parameter
            V_scaler_shape = K.get_variable_shape(V_scaler)
            m_g = K.zeros(V_scaler_shape)
            v_g = K.zeros(V_scaler_shape)

            # update g parameters
            m_g_t = (self.beta_1 * m_g) + (1. - self.beta_1) * grad_g
            v_g_t = (self.beta_2 * v_g) + (1. - self.beta_2) * K.square(grad_g)
            new_g_param = g_param - lr_t * m_g_t / (K.sqrt(v_g_t) + self.epsilon)
            self.updates.append(K.update(m_g, m_g_t))
            self.updates.append(K.update(v_g, v_g_t))

            # update V parameters
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * grad_V
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad_V)
            new_V_param = V - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            # if there are constraints we apply them to V, not W
            if getattr(p, 'constraint', None) is not None:
                new_V_param = p.constraint(new_V_param)
                pass

            # wn param updates --> W updates
            add_weightnorm_param_updates(self.updates, new_V_param, new_g_param, p, V_scaler)
            pass
        else:  # do optimization normally
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
            p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))

            new_p = p_t
            # apply constraints
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)
                pass
            self.updates.append(K.update(p, new_p))
            pass
        pass
    return self.updates
Example #28
Source File: adabound.py From keras-adabound with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                  K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1

    # Applies bounds on actual learning rate
    step_size = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                      (1. - K.pow(self.beta_1, t)))

    final_lr = self.final_lr * lr / self.base_lr
    lower_bound = final_lr * (1. - 1. / (self.gamma * t + 1.))
    upper_bound = final_lr * (1. + 1. / (self.gamma * t))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsbound:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # apply weight decay
        if self.weight_decay != 0.:
            g += self.weight_decay * K.stop_gradient(p)

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsbound:
            vhat_t = K.maximum(vhat, v_t)
            denom = (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            denom = (K.sqrt(v_t) + self.epsilon)

        # Compute the bounds
        step_size_p = step_size * K.ones_like(denom)
        step_size_p_bound = step_size_p / denom
        bounded_lr_t = m_t * K.minimum(K.maximum(step_size_p_bound,
                                                 lower_bound), upper_bound)

        p_t = p - bounded_lr_t

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #29
Source File: optimization.py From BERT_with_keras with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = tf.train.polynomial_decay(
        self.lr,
        self.iterations,
        self.num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False
    )
    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    t = K.cast(self.iterations, K.floatx()) + 1
    warmup_percent_done = K.cast(t / self.num_warmup_steps, dtype=K.floatx())
    warmup_lr = self.lr * warmup_percent_done
    is_warmup = K.cast(t < self.num_warmup_steps, dtype=K.floatx())
    lr = ((1.0 - is_warmup) * lr) + is_warmup * warmup_lr

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = [self.iterations] + ms + vs

    for p, g, m, v in zip(params, grads, ms, vs):
        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
        if self.bias_corrected:
            m_t /= 1 - K.pow(self.beta_1, t)
            v_t /= 1 - K.pow(self.beta_2, t)

        update = m_t / (K.sqrt(v_t) + self.epsilon)

        # Just adding the square of the weights to the loss function is *not*
        # the correct way of using L2 regularization/weight decay with Adam,
        # since that will interact with the m and v parameters in strange ways.
        #
        # Instead we want to decay the weights in a manner that doesn't interact
        # with the m/v parameters. This is equivalent to adding the square
        # of the weights to the loss with plain (non-momentum) SGD.
        param_name = self._get_variable_name(p.name)
        if self._do_use_weight_decay(param_name):
            update += self.weight_decay_rate * p

        p_t = p - lr * update

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates
Example #30
Source File: adamlr.py From StyleGAN-Keras with MIT License | 4 votes |
def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
        lr *= (1. / (1. + self.decay * K.cast(self.iterations,
                                              K.dtype(self.decay))))

    t = K.cast(self.iterations, K.floatx()) + 1
    lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                 (1. - K.pow(self.beta_1, t)))

    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
        vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
        vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats

    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
        # Learning rate multipliers
        if self.multipliers:
            multiplier = [mult for mult in self.multipliers if mult in p.name]
        else:
            multiplier = None
        if multiplier:
            new_lr_t = lr_t * self.multipliers[multiplier[0]]
            if self.debug_verbose:
                print('Setting {} to learning rate {}'.format(multiplier[0], new_lr_t))
                print(K.get_value(new_lr_t))
        else:
            new_lr_t = lr_t
            if self.debug_verbose:
                print('No change in learning rate {}'.format(p.name))
                print(K.get_value(new_lr_t))

        m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
        v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

        if self.amsgrad:
            vhat_t = K.maximum(vhat, v_t)
            p_t = p - new_lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
            self.updates.append(K.update(vhat, vhat_t))
        else:
            p_t = p - new_lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

        self.updates.append(K.update(m, m_t))
        self.updates.append(K.update(v, v_t))
        new_p = p_t

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)

        self.updates.append(K.update(p, new_p))
    return self.updates