Python theano.shared() Examples
The following are 30 code examples of theano.shared(), collected from open-source projects. The original project, source file, and license are noted above each example.
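Before the project examples, here is a minimal, self-contained sketch (written for this page, not taken from any of the projects below) of what theano.shared() does: it wraps a value in a symbolic variable whose state persists across calls to compiled functions, and that state can be modified either through the updates argument of theano.function or directly with get_value()/set_value().

import numpy as np
import theano
import theano.tensor as T

# A shared variable holds state (here a weight vector) that lives on after each call.
w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')

x = T.vector('x')
cost = T.sum((w - x) ** 2)

# Each call to `step` applies the (shared_variable, new_value) update pair.
step = theano.function([x], cost, updates=[(w, w - 0.1 * T.grad(cost, w))])

target = np.ones(3, dtype=theano.config.floatX)
for _ in range(10):
    step(target)

print(w.get_value())                                   # state accumulated across calls
w.set_value(np.zeros(3, dtype=theano.config.floatX))   # state can also be reset directly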
Example #1
Source File: optimization.py From Att-ChemdNER with Apache License 2.0
def sgd(self, cost, params, constraints={}, lr=0.01):
    """
    Stochastic gradient descent.
    """
    updates = []
    lr = theano.shared(np.float32(lr).astype(floatX))
    gradients = self.get_gradients(cost, params)
    for p, g in zip(params, gradients):
        v = -lr * g
        new_p = p + v
        # apply constraints
        if p in constraints:
            c = constraints[p]
            new_p = c(new_p)
        updates.append((p, new_p))
    return updates
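The updates list returned by sgd() (and by the other optimizers on this page) is only applied once it is handed to theano.function. A self-contained sketch of that pattern, using an inline SGD update list of the same (parameter, new_value) form rather than the project's Optimization class:

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

x = T.matrix('x')
y = T.vector('y')
W = theano.shared(np.zeros(5, dtype=floatX), name='W')

cost = T.mean((T.dot(x, W) - y) ** 2)
lr = theano.shared(np.float32(0.01).astype(floatX))

# Same structure that sgd() returns: a list of (shared_variable, new_value) pairs.
updates = [(W, W - lr * T.grad(cost, W))]

# Compiling with `updates` turns the pairs into in-place modifications of W.
train_step = theano.function([x, y], cost, updates=updates)

data_x = np.random.randn(8, 5).astype(floatX)
data_y = np.random.randn(8).astype(floatX)
print(train_step(data_x, data_y))   # each call applies the update and returns the cost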
Example #2
Source File: optimization.py From Att-ChemdNER with Apache License 2.0
def adadelta(self, cost, params, rho=0.95, epsilon=1e-6, consider_constant=None):
    """
    Adadelta. Based on:
    http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
    """
    rho = theano.shared(np.float32(rho).astype(floatX))
    epsilon = theano.shared(np.float32(epsilon).astype(floatX))
    gradients = self.get_gradients(cost, params, consider_constant)
    accu_gradients = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX))
                      for param in params]
    accu_deltas = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX))
                   for param in params]

    updates = []
    for param, gradient, accu_gradient, accu_delta in zip(params, gradients, accu_gradients, accu_deltas):
        new_accu_gradient = rho * accu_gradient + (1. - rho) * gradient ** 2.
        delta_x = -T.sqrt((accu_delta + epsilon) / (new_accu_gradient + epsilon)) * gradient
        new_accu_delta = rho * accu_delta + (1. - rho) * delta_x ** 2.
        updates.append((accu_gradient, new_accu_gradient))
        updates.append((accu_delta, new_accu_delta))
        updates.append((param, param + delta_x))
    return updates
Example #3
Source File: net.py From Depth-Map-Prediction with GNU General Public License v3.0
def _init_params(self, init_W, tie_params):
    (nfilt, fc, fi, fj) = self.filter_shape
    if 'W' not in tie_params:
        if init_W is None:
            w_shape = self.filter_shape
            init_W = self.conf.geteval('init_W')(w_shape).astype(floatX)
        self.W = theano.shared(value=init_W, name='W')
        self.params.append(self.W)
    if self.have_bias and 'b' not in tie_params:
        init_b = self.conf.geteval('init_b', 0)
        nb = nfilt if not self.transpose else fc
        self.b = theano.shared(init_b + np.zeros(nb, dtype=floatX), name='b')
        self.params.append(self.b)

# compute the output of the network
Example #4
Source File: optimization.py From Att-ChemdNER with Apache License 2.0
def sgdmomentum(self, cost, params, constraints={}, lr=0.01, consider_constant=None, momentum=0.):
    """
    Stochastic gradient descent with momentum.
    Momentum has to be in [0, 1).
    """
    # Check that the momentum is a correct value
    assert 0 <= momentum < 1

    lr = theano.shared(np.float32(lr).astype(floatX))
    momentum = theano.shared(np.float32(momentum).astype(floatX))

    gradients = self.get_gradients(cost, params)
    velocities = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX))
                  for param in params]

    updates = []
    for param, gradient, velocity in zip(params, gradients, velocities):
        new_velocity = momentum * velocity - lr * gradient
        updates.append((velocity, new_velocity))
        new_p = param + new_velocity
        # apply constraints
        if param in constraints:
            c = constraints[param]
            new_p = c(new_p)
        updates.append((param, new_p))
    return updates
Example #5
Source File: net.py From Depth-Map-Prediction with GNU General Public License v3.0
def _init_params(self, init_W, tie_params):
    if 'W' not in tie_params:
        if init_W is None:
            w_shape = (self.ninput, self.noutput)
            init_W = self.conf.geteval('init_W')(w_shape).astype(floatX)
        self.W = theano.shared(value=init_W, name='W')
        self.params.append(self.W)
    if self.have_bias and 'b' not in tie_params:
        nbias = self.noutput if not self.transpose else self.ninput
        init_b = self.conf.geteval('init_b', 0)
        init_b = self.conf.geteval('init_bias', init_b)
        self.bias = theano.shared(init_b + np.zeros(nbias, dtype=floatX), name='bias')
        self.params.append(self.bias)

# compute the network output
Example #6
Source File: optimization.py From Att-ChemdNER with Apache License 2.0
def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6, consider_constant=None):
    """
    RMSProp.
    """
    lr = theano.shared(np.float32(lr).astype(floatX))
    gradients = self.get_gradients(cost, params, consider_constant)
    accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params]

    updates = []
    for param, gradient, accumulator in zip(params, gradients, accumulators):
        new_accumulator = rho * accumulator + (1 - rho) * gradient ** 2
        updates.append((accumulator, new_accumulator))
        new_param = param - lr * gradient / T.sqrt(new_accumulator + eps)
        updates.append((param, new_param))
    return updates
Example #7
Source File: base_gru.py From gated-graph-transformer-network with MIT License
def __init__(self, input_width, output_width, activation_shift=0.0, name=None,
             dropout_keep=1, dropout_input=False, dropout_output=True):
    """
    Params:
        input_width: Width of input
        output_width: Width of the GRU output
        activation_shift: How to shift the biases of the activation
    """
    self._input_width = input_width
    self._output_width = output_width

    prefix = "" if name is None else name + "_"

    self._reset_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "reset_W")
    self._reset_b = theano.shared(init_params([output_width], shift=1.0), prefix + "reset_b")

    self._update_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "update_W")
    self._update_b = theano.shared(init_params([output_width], shift=1.0), prefix + "update_b")

    self._activation_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "activation_W")
    self._activation_b = theano.shared(init_params([output_width], shift=activation_shift), prefix + "activation_b")

    self._dropout_keep = dropout_keep
    self._dropout_input = dropout_input
    self._dropout_output = dropout_output
Example #8
Source File: strength_weighted_gru.py From gated-graph-transformer-network with MIT License
def __init__(self, input_width, output_width, activation_shift=0.0, name=None):
    """
    Params:
        input_width: Width of input.
        output_width: Width of the GRU output
        activation_shift: How to shift the biases of the activation
    """
    self._input_width = input_width
    self._output_width = output_width

    prefix = "" if name is None else name + "_"

    self._reset_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "reset_W")
    self._reset_b = theano.shared(init_params([output_width], shift=1.0), prefix + "reset_b")

    self._update_W = theano.shared(init_params([input_width + output_width, output_width + 1]), prefix + "update_W")
    self._update_b = theano.shared(init_params([output_width + 1], shift=1.0), prefix + "update_b")

    self._activation_W = theano.shared(init_params([input_width + output_width, output_width]), prefix + "activation_W")
    self._activation_b = theano.shared(init_params([output_width], shift=activation_shift), prefix + "activation_b")

    self._strength_W = theano.shared(init_params([input_width + output_width, 1]), prefix + "strength_W")
    self._strength_b = theano.shared(init_params([1], shift=1.0), prefix + "strength_b")
Example #9
Source File: optimization.py From Att-ChemdNER with Apache License 2.0
def adagrad(self, cost, params, lr=1.0, epsilon=1e-6, consider_constant=None):
    """
    Adagrad. Based on http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf
    """
    lr = theano.shared(np.float32(lr).astype(floatX))
    epsilon = theano.shared(np.float32(epsilon).astype(floatX))
    gradients = self.get_gradients(cost, params, consider_constant)
    gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX))
             for param in params]

    updates = []
    for param, gradient, gsum in zip(params, gradients, gsums):
        new_gsum = gsum + gradient ** 2.
        updates.append((gsum, new_gsum))
        updates.append((param, param - lr * gradient / (T.sqrt(gsum + epsilon))))
    return updates
Example #10
Source File: adam.py From gated-graph-transformer-network with MIT License
def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(np.array(0., theano.config.floatX))
    i_t = i + 1.
    fix1 = 1. - (1. - b1) ** i_t
    fix2 = 1. - (1. - b2) ** i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates
Example #11
Source File: nmt.py From nmt with BSD 3-Clause "New" or "Revised" License
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up, profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #12
Source File: theano_backend.py From Att-ChemdNER with Apache License 2.0
def variable(value, dtype=None, name=None):
    '''Instantiates a variable and returns it.

    # Arguments
        value: Numpy array, initial value of the tensor.
        dtype: Tensor type.
        name: Optional name string for the tensor.

    # Returns
        A variable instance (with Keras metadata included).
    '''
    if dtype is None:
        dtype = floatx()
    if hasattr(value, 'tocoo'):
        _assert_sparse_module()
        variable = th_sparse_module.as_sparse_variable(value)
    else:
        value = np.asarray(value, dtype=dtype)
        variable = theano.shared(value=value, name=name, strict=False)
    variable._keras_shape = value.shape
    variable._uses_learning_phase = False
    return variable
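For instance, calling this backend helper on a dense NumPy array simply returns a Theano shared variable with the two Keras attributes attached (an illustrative call, not part of the original file):

val = np.random.random((2, 3))
kvar = variable(val, dtype='float32', name='example_var')
print(kvar._keras_shape)        # (2, 3)
print(kvar.get_value().shape)   # kvar behaves like any other Theano shared variable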
Example #13
Source File: blocks.py From spinn with MIT License
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6, grads=None):
    # From:
    # https://github.com/Newmu/Theano-Tutorials/blob/master/4_modern_net.py
    if grads is None:
        grads = T.grad(cost=cost, wrt=params)
    assert len(grads) == len(params)

    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(np.zeros_like(p.get_value(), dtype=np.float32),
                            name="%s/rms/acc" % p.name)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
Example #14
Source File: hgru4rec.py From hgru4rec with MIT License
def adam(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
    v1 = np.float32(self.decay)
    v2 = np.float32(1.0 - self.decay)
    acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    if sample_idx is None:
        acc_new = v1 * acc + v2 * grad ** 2
        meang_new = v1 * meang + v2 * grad
        countt_new = countt + 1
        updates[acc] = acc_new
        updates[meang] = meang_new
        updates[countt] = countt_new
    else:
        acc_s = acc[sample_idx]
        meang_s = meang[sample_idx]
        countt_s = countt[sample_idx]
        acc_new = v1 * acc_s + v2 * grad ** 2
        meang_new = v1 * meang_s + v2 * grad
        countt_new = countt_s + 1.0
        updates[acc] = T.set_subtensor(acc_s, acc_new)
        updates[meang] = T.set_subtensor(meang_s, meang_new)
        updates[countt] = T.set_subtensor(countt_s, countt_new)
    return (meang_new / (1 - v1 ** countt_new)) / (T.sqrt(acc_new / (1 - v1 ** countt_new)) + epsilon)
Example #15
Source File: nmt.py From nmt with BSD 3-Clause "New" or "Revised" License
def debugging_adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up, profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #16
Source File: test_recurrent.py From CAPTCHA-breaking with MIT License
def _runner(layer_class):
    """
    All the recurrent layers share the same interface,
    so we can run through them with a single function.
    """
    for weights in [None, [np.ones((input_dim, output_dim))]]:
        for ret_seq in [True, False]:
            layer = layer_class(input_dim, output_dim, return_sequences=ret_seq, weights=weights)
            layer.input = theano.shared(value=np.ones((nb_samples, timesteps, input_dim)))
            config = layer.get_config()

            for train in [True, False]:
                out = layer.get_output(train).eval()

                # Make sure the output has the desired shape
                if ret_seq:
                    assert(out.shape == (nb_samples, timesteps, output_dim))
                else:
                    assert(out.shape == (nb_samples, output_dim))

                mask = layer.get_output_mask(train)
Example #17
Source File: test_core.py From CAPTCHA-breaking with MIT License
def test_connections(self):
    nb_samples = 10
    input_dim = 5
    layer1 = core.Layer()
    layer2 = core.Layer()

    input = np.ones((nb_samples, input_dim))
    layer1.input = theano.shared(value=input)

    # As long as there is no previous layer, an error should be raised.
    for train in [True, False]:
        self.assertRaises(AttributeError, layer2.get_input, train)

    # After connecting, input of layer1 should be passed through
    layer2.set_previous(layer1)
    for train in [True, False]:
        assert_allclose(layer2.get_input(train).eval(), input)
        assert_allclose(layer2.get_output(train).eval(), input)
Example #18
Source File: layers.py From DL4MT with BSD 3-Clause "New" or "Revised" License
def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    # re-scale dropout at training time, so we don't need to at test time
    if scaled:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj


# feedforward layer: affine transformation + point-wise nonlinearity
Example #19
Source File: updates.py From iGAN with MIT License
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)

        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = self.rho * acc + (1 - self.rho) * g ** 2
        updates.append((acc, acc_new))

        update = g * T.sqrt(acc_delta + self.epsilon) / T.sqrt(acc_new + self.epsilon)
        updated_p = p - self.lr * update
        updated_p = self.regularizer.weight_regularize(updated_p)
        updates.append((p, updated_p))

        acc_delta_new = self.rho * acc_delta + (1 - self.rho) * update ** 2
        updates.append((acc_delta, acc_delta_new))
    return updates
Example #20
Source File: updates.py From iGAN with MIT License
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    t = theano.shared(floatX(1.))
    b1_t = self.b1 * self.l ** (t - 1)

    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_t = b1_t * m + (1 - b1_t) * g
        v_t = self.b2 * v + (1 - self.b2) * g ** 2
        m_c = m_t / (1 - self.b1 ** t)
        v_c = v_t / (1 - self.b2 ** t)
        p_t = p - (self.lr * m_c) / (T.sqrt(v_c) + self.e)
        p_t = self.regularizer.weight_regularize(p_t)

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.))
    return updates
Example #21
Source File: nn.py From opt-mmd with BSD 3-Clause "New" or "Revised" License
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1 * v + (1. - mom1) * g
        mg_t = mom2 * mg + (1. - mom2) * T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t + 1))
    return updates
Example #22
Source File: solver.py From 3D-R2N2 with MIT License
def SGD(lr, params, grads, loss):
    """Stochastic Gradient Descent w/ momentum"""
    momentum = cfg.TRAIN.MOMENTUM
    w_decay = cfg.TRAIN.WEIGHT_DECAY

    updates = []
    for param, grad in zip(params, grads):
        vel = theano.shared(param.val.get_value() * 0.)

        if param.is_bias or w_decay == 0:
            regularized_grad = grad
        else:
            regularized_grad = grad + w_decay * param.val

        param_additive = momentum * vel - lr * regularized_grad
        updates.append((vel, param_additive))
        updates.append((param.val, param.val + param_additive))

    return updates
Example #23
Source File: hgru4rec.py From hgru4rec with MIT License
def adadelta(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
    v1 = np.float32(self.decay)
    v2 = np.float32(1.0 - self.decay)
    acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    upd = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    if sample_idx is None:
        acc_new = acc + grad ** 2
        updates[acc] = acc_new
        grad = T.sqrt(upd + epsilon) * grad
        upd_new = v1 * upd + v2 * grad ** 2
        updates[upd] = upd_new
    else:
        acc_s = acc[sample_idx]
        acc_new = acc_s + grad ** 2
        updates[acc] = T.set_subtensor(acc_s, acc_new)
        upd_s = upd[sample_idx]
        upd_new = v1 * upd_s + v2 * grad ** 2
        updates[upd] = T.set_subtensor(upd_s, upd_new)
        grad = T.sqrt(upd_s + epsilon) * grad
    gradient_scaling = T.cast(T.sqrt(acc_new + epsilon), theano.config.floatX)
    return grad / gradient_scaling
Example #24
Source File: variable_store.py From spinn with MIT License
def add_param(self, name, shape, initializer=None, savable=True, trainable=True):
    if not initializer:
        initializer = self.default_initializer

    if name not in self.vars:
        full_name = "%s/%s" % (self.prefix, name)
        if self.logger:
            self.logger.Log(
                "Created variable " + full_name + " shape: " + str(shape),
                level=self.logger.DEBUG)

        init_value = initializer(shape).astype(theano.config.floatX)
        self.vars[name] = theano.shared(init_value, name=full_name)

        if savable:
            self.savable_vars[name] = self.vars[name]
        if trainable:
            self.trainable_vars[name] = self.vars[name]

    return self.vars[name]
Example #25
Source File: skipthoughts.py From StackGAN with MIT License
def init_tparams(params):
    """
    initialize Theano shared variables according to the initial parameters
    """
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams
Example #26
Source File: 5_convolutional_net.py From Theano-Tutorials with MIT License
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))
Example #27
Source File: 5_convolutional_net.py From Theano-Tutorials with MIT License
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
Example #28
Source File: optimizers.py From DL4MT with BSD 3-Clause "New" or "Revised" License
def adadelta(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rg2up, profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #29
Source File: 4_modern_net.py From Theano-Tutorials with MIT License
def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))
Example #30
Source File: 4_modern_net.py From Theano-Tutorials with MIT License
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates