Python theano.tensor.sqrt() Examples
The following are 30 code examples of theano.tensor.sqrt(). You can go to the original project or source file by following the links above each example, or check out all the available functions and classes of the theano.tensor module.
Example #1
Source File: model2.py From Projects with MIT License | 6 votes |
def adam(self, cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8):
    ''' adam gradient descent updates '''
    updates = []
    grads = T.grad(cost, params)
    self.i = theano.shared(np.float32(0.))
    i_t = self.i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        self.m = theano.shared(p.get_value() * 0.)
        self.v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * self.m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * self.v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((self.m, m_t))
        updates.append((self.v, v_t))
        updates.append((p, p_t))
    updates.append((self.i, i_t))
    return updates

# load saved lstm if it exists, else initialize new lstm
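An update list like the one returned above is only a symbolic recipe; it takes effect once it is passed to theano.function. Below is a minimal, hypothetical usage sketch (not taken from the project above): the toy model, the variable names x, y, W, and the plain SGD update used as a stand-in for adam() are all made up for illustration.

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX

# hypothetical toy model: one weight vector, squared-error cost
x = T.matrix('x')
y = T.vector('y')
W = theano.shared(np.zeros(5, dtype=floatX), name='W')
params = [W]
cost = T.mean(T.sqr(T.dot(x, W) - y))

# build an update list (plain SGD here, standing in for the adam() updates above)
lr = np.asarray(0.01, dtype=floatX)
grads = T.grad(cost, params)
updates = [(p, p - lr * g) for p, g in zip(params, grads)]

# theano.function applies every (shared_variable, new_value) pair on each call
train_step = theano.function([x, y], cost, updates=updates)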
Example #2
Source File: optimization.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6, consider_constant=None):
    """
    RMSProp.
    """
    lr = theano.shared(np.float32(lr).astype(floatX))

    gradients = self.get_gradients(cost, params, consider_constant)
    accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params]

    updates = []
    for param, gradient, accumulator in zip(params, gradients, accumulators):
        new_accumulator = rho * accumulator + (1 - rho) * gradient ** 2
        updates.append((accumulator, new_accumulator))

        new_param = param - lr * gradient / T.sqrt(new_accumulator + eps)
        updates.append((param, new_param))

    return updates
Example #3
Source File: adam.py From gated-graph-transformer-network with MIT License | 6 votes |
def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    i = theano.shared(np.array(0., theano.config.floatX))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    return updates
Example #4
Source File: optimization.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def adadelta(self, cost, params, rho=0.95, epsilon=1e-6, consider_constant=None):
    """
    Adadelta. Based on:
    http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
    """
    rho = theano.shared(np.float32(rho).astype(floatX))
    epsilon = theano.shared(np.float32(epsilon).astype(floatX))

    gradients = self.get_gradients(cost, params, consider_constant)
    accu_gradients = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params]
    accu_deltas = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params]

    updates = []
    for param, gradient, accu_gradient, accu_delta in zip(params, gradients, accu_gradients, accu_deltas):
        new_accu_gradient = rho * accu_gradient + (1. - rho) * gradient ** 2.
        delta_x = - T.sqrt((accu_delta + epsilon) / (new_accu_gradient + epsilon)) * gradient
        new_accu_delta = rho * accu_delta + (1. - rho) * delta_x ** 2.
        updates.append((accu_gradient, new_accu_gradient))
        updates.append((accu_delta, new_accu_delta))
        updates.append((param, param + delta_x))
    return updates
Example #5
Source File: blocks.py From spinn with MIT License | 6 votes |
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6, grads=None):
    # From:
    # https://github.com/Newmu/Theano-Tutorials/blob/master/4_modern_net.py
    if grads is None:
        grads = T.grad(cost=cost, wrt=params)
    assert len(grads) == len(params)

    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(np.zeros_like(p.get_value(), dtype=np.float32),
                            name="%s/rms/acc" % p.name)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
Example #6
Source File: hgru4rec.py From hgru4rec with MIT License | 6 votes |
def adam(self, param, grad, updates, sample_idx=None, epsilon=1e-6):
    v1 = np.float32(self.decay)
    v2 = np.float32(1.0 - self.decay)
    acc = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    meang = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    countt = theano.shared(param.get_value(borrow=False) * 0., borrow=True)
    if sample_idx is None:
        acc_new = v1 * acc + v2 * grad ** 2
        meang_new = v1 * meang + v2 * grad
        countt_new = countt + 1
        updates[acc] = acc_new
        updates[meang] = meang_new
        updates[countt] = countt_new
    else:
        acc_s = acc[sample_idx]
        meang_s = meang[sample_idx]
        countt_s = countt[sample_idx]
        acc_new = v1 * acc_s + v2 * grad ** 2
        meang_new = v1 * meang_s + v2 * grad
        countt_new = countt_s + 1.0
        updates[acc] = T.set_subtensor(acc_s, acc_new)
        updates[meang] = T.set_subtensor(meang_s, meang_new)
        updates[countt] = T.set_subtensor(countt_s, countt_new)
    return (meang_new / (1 - v1 ** countt_new)) / (T.sqrt(acc_new / (1 - v1 ** countt_new)) + epsilon)
Example #7
Source File: __init__.py From adversarial with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_noise(self, size):
    # Allow just requesting batch size
    if isinstance(size, int):
        size = (size, self.get_input_space().get_total_dimension())
    if not hasattr(self, 'noise'):
        self.noise = "gaussian"
    if self.noise == "uniform":
        return self.theano_rng.uniform(low=-np.sqrt(3), high=np.sqrt(3),
                                       size=size, dtype='float32')
    elif self.noise == "gaussian":
        return self.theano_rng.normal(size=size, dtype='float32')
    elif self.noise == "spherical":
        noise = self.theano_rng.normal(size=size, dtype='float32')
        noise = noise / T.maximum(1e-7, T.sqrt(T.sqr(noise).sum(axis=1))).dimshuffle(0, 'x')
        return noise
    else:
        raise NotImplementedError(self.noise)
Example #8
Source File: deconv.py From adversarial with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _modify_updates(self, updates):
    """
    Replaces the values in `updates` if needed to enforce the options set
    in the __init__ method, including `max_kernel_norm`.

    Parameters
    ----------
    updates : OrderedDict
        A dictionary mapping parameters (including parameters not
        belonging to this model) to updated values of those parameters.
        The dictionary passed in contains the updates proposed by the
        learning algorithm. This function modifies the dictionary
        directly. The modified version will be compiled and executed by
        the learning algorithm.
    """
    if self.max_kernel_norm is not None:
        W, = self.transformer.get_params()
        if W in updates:
            updated_W = updates[W]
            row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=(0, 1, 2)))
            desired_norms = T.clip(row_norms, 0, self.max_kernel_norm)
            scales = desired_norms / (1e-7 + row_norms)
            updates[W] = (updated_W * scales.dimshuffle('x', 'x', 'x', 0))
Example #9
Source File: nn.py From opt-mmd with BSD 3-Clause "New" or "Revised" License | 6 votes |
def adam_updates(params, cost, lr=0.001, mom1=0.9, mom2=0.999):
    updates = []
    grads = T.grad(cost, params)
    t = th.shared(np.cast[th.config.floatX](1.))
    for p, g in zip(params, grads):
        v = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        mg = th.shared(np.cast[th.config.floatX](p.get_value() * 0.))
        v_t = mom1*v + (1. - mom1)*g
        mg_t = mom2*mg + (1. - mom2)*T.square(g)
        v_hat = v_t / (1. - mom1 ** t)
        mg_hat = mg_t / (1. - mom2 ** t)
        g_t = v_hat / T.sqrt(mg_hat + 1e-8)
        p_t = p - lr * g_t
        updates.append((v, v_t))
        updates.append((mg, mg_t))
        updates.append((p, p_t))
    updates.append((t, t+1))
    return updates
Example #10
Source File: nn.py From opt-mmd with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_output_for(self, input, deterministic=False, **kwargs):
    if deterministic:
        norm_features = (input - self.avg_batch_mean.dimshuffle(*self.dimshuffle_args)) / T.sqrt(1e-6 + self.avg_batch_var).dimshuffle(*self.dimshuffle_args)
    else:
        batch_mean = T.mean(input, axis=self.axes_to_sum).flatten()
        centered_input = input - batch_mean.dimshuffle(*self.dimshuffle_args)
        batch_var = T.mean(T.square(centered_input), axis=self.axes_to_sum).flatten()
        batch_stdv = T.sqrt(1e-6 + batch_var)
        norm_features = centered_input / batch_stdv.dimshuffle(*self.dimshuffle_args)

        # BN updates
        new_m = 0.9*self.avg_batch_mean + 0.1*batch_mean
        new_v = 0.9*self.avg_batch_var + T.cast((0.1*input.shape[0])/(input.shape[0]-1), th.config.floatX)*batch_var
        self.bn_updates = [(self.avg_batch_mean, new_m), (self.avg_batch_var, new_v)]

    if hasattr(self, 'g'):
        activation = norm_features*self.g.dimshuffle(*self.dimshuffle_args)
    else:
        activation = norm_features
    if hasattr(self, 'b'):
        activation += self.b.dimshuffle(*self.dimshuffle_args)
    return self.nonlinearity(activation)
Example #11
Source File: nn.py From opt-mmd with BSD 3-Clause "New" or "Revised" License | 6 votes |
def get_output_for(self, input, init=False, deterministic=False, **kwargs):
    if input.ndim > 2:
        # if the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    activation = T.dot(input, self.W)

    if init:
        ma = T.mean(activation, axis=0)
        activation -= ma.dimshuffle('x', 0)
        stdv = T.sqrt(T.mean(T.square(activation), axis=0))
        activation /= stdv.dimshuffle('x', 0)
        self.init_updates = [(self.weight_scale, self.weight_scale/stdv),
                             (self.b, -ma/stdv)]
    else:
        activation += self.b.dimshuffle('x', 0)

    return self.nonlinearity(activation)
Example #12
Source File: updates.py From iGAN with MIT License | 6 votes |
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    t = theano.shared(floatX(1.))
    b1_t = self.b1 * self.l**(t - 1)

    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)

        m_t = b1_t * m + (1 - b1_t) * g
        v_t = self.b2 * v + (1 - self.b2) * g**2
        m_c = m_t / (1 - self.b1**t)
        v_c = v_t / (1 - self.b2**t)
        p_t = p - (self.lr * m_c) / (T.sqrt(v_c) + self.e)
        p_t = self.regularizer.weight_regularize(p_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t, t + 1.))
    return updates
Example #13
Source File: updates.py From iGAN with MIT License | 6 votes |
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        acc = theano.shared(p.get_value() * 0.)
        acc_delta = theano.shared(p.get_value() * 0.)
        acc_new = self.rho * acc + (1 - self.rho) * g ** 2
        updates.append((acc, acc_new))

        update = g * T.sqrt(acc_delta + self.epsilon) / T.sqrt(acc_new + self.epsilon)
        updated_p = p - self.lr * update
        updated_p = self.regularizer.weight_regularize(updated_p)
        updates.append((p, updated_p))

        acc_delta_new = self.rho * acc_delta + (1 - self.rho) * update ** 2
        updates.append((acc_delta, acc_delta_new))
    return updates
Example #14
Source File: mujoco_costs.py From adversarial-policies with MIT License | 6 votes |
def __init__(self):
    def f(x, u, i, terminal):
        if terminal:
            ctrl_cost = T.zeros_like(x[..., 0])
        else:
            ctrl_cost = T.square(u).sum(axis=-1)

        # x: (batch_size, 8)
        # x[..., 0:4]: qpos
        # x[..., 4:8]: qvel, time derivatives of qpos, not used in the cost.
        theta = x[..., 0]  # qpos[0]: angle of joint 0
        phi = x[..., 1]  # qpos[1]: angle of joint 1
        target_xpos = x[..., 2:4]  # qpos[2:4], target x & y coordinate
        body1_xpos = 0.1 * T.stack([T.cos(theta), T.sin(theta)], axis=1)
        tip_xpos_incr = 0.11 * T.stack([T.cos(phi), T.sin(phi)], axis=1)
        tip_xpos = body1_xpos + tip_xpos_incr
        delta = tip_xpos - target_xpos
        state_cost = T.sqrt(T.sum(delta * delta, axis=-1))
        cost = state_cost + ctrl_cost

        return cost

    super().__init__(f, state_size=8, action_size=2)
Example #15
Source File: nmt.py From nmt with BSD 3-Clause "New" or "Revised" License | 6 votes |
def adadelta(lr, tparams, grads, inp, cost):
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    rg2_new = [0.95 * rg2 + 0.05 * (g ** 2) for rg2, g in zip(running_grads2, grads)]
    rg2up = [(rg2, r_n) for rg2, r_n in zip(running_grads2, rg2_new)]

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(grads, running_up2, rg2_new)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    inp += [lr]
    f_update = theano.function(inp, cost, updates=rg2up+ru2up+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_update
Example #16
Source File: mmd.py From opt-mmd with BSD 3-Clause "New" or "Revised" License | 6 votes |
def rbf_mmd2_streaming_and_ratio(X, Y, sigma=0):
    # n = (T.smallest(X.shape[0], Y.shape[0]) // 2) * 2
    n = (X.shape[0] // 2) * 2
    gamma = 1 / (2 * sigma**2)
    rbf = lambda A, B: T.exp(-gamma * ((A - B) ** 2).sum(axis=1))
    h_bits = (rbf(X[:n:2], X[1:n:2]) + rbf(Y[:n:2], Y[1:n:2])
              - rbf(X[:n:2], Y[1:n:2]) - rbf(X[1:n:2], Y[:n:2]))
    mmd2 = h_bits.mean()

    # variance is 1/2 E_{v, v'} (h(v) - h(v'))^2
    # estimate with even, odd diffs
    m = (n // 2) * 2
    approx_var = 1/2 * ((h_bits[:m:2] - h_bits[1:m:2]) ** 2).mean()
    ratio = mmd2 / T.sqrt(T.largest(approx_var, _eps))
    return mmd2, ratio


################################################################################
### MMD with linear kernel

# Hotelling test statistic is from:
#     Jitkrittum, Szabo, Chwialkowski, and Gretton.
#     Interpretable Distribution Features with Maximum Testing Power.
#     NIPS 2015.
Example #17
Source File: nmt.py From nmt with BSD 3-Clause "New" or "Revised" License | 6 votes |
def debugging_adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #18
Source File: nmt.py From nmt with BSD 3-Clause "New" or "Revised" License | 6 votes |
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up, profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(itemlist(tparams), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #19
Source File: conv_net.py From Projects with MIT License | 6 votes |
def adam(self, cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8):
    updates = []
    grads = T.grad(cost, params)
    self.i = theano.shared(np.float32(0.))
    i_t = self.i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        self.m = theano.shared(p.get_value() * 0.)
        self.v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * self.m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * self.v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((self.m, m_t))
        updates.append((self.v, v_t))
        updates.append((p, p_t))
    updates.append((self.i, i_t))
    return updates
Example #20
Source File: optimization.py From Att-ChemdNER with Apache License 2.0 | 6 votes |
def adagrad(self, cost, params, lr=1.0, epsilon=1e-6, consider_constant=None):
    """
    Adagrad. Based on
    http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf
    """
    lr = theano.shared(np.float32(lr).astype(floatX))
    epsilon = theano.shared(np.float32(epsilon).astype(floatX))

    gradients = self.get_gradients(cost, params, consider_constant)
    gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params]

    updates = []
    for param, gradient, gsum in zip(params, gradients, gsums):
        new_gsum = gsum + gradient ** 2.
        updates.append((gsum, new_gsum))
        updates.append((param, param - lr * gradient / (T.sqrt(gsum + epsilon))))
    return updates
Example #21
Source File: model1.py From Projects with MIT License | 6 votes |
def adam(self, cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8):
    ''' adam gradient descent updates '''
    updates = []
    grads = T.grad(cost, params)
    self.i = theano.shared(np.float32(0.))
    i_t = self.i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)
    for p, g in zip(params, grads):
        self.m = theano.shared(p.get_value() * 0.)
        self.v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * self.m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * self.v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((self.m, m_t))
        updates.append((self.v, v_t))
        updates.append((p, p_t))
    updates.append((self.i, i_t))
    return updates

# open previous lowest training cost if it exists
Example #22
Source File: updates.py From iGAN with MIT License | 5 votes |
def __call__(self, params, cost):
    updates = []
    grads = T.grad(cost, params)
    grads = clip_norms(grads, self.clipnorm)
    for p, g in zip(params, grads):
        g = self.regularizer.gradient_regularize(p, g)
        acc = theano.shared(p.get_value() * 0.)
        acc_t = acc + g ** 2
        updates.append((acc, acc_t))

        p_t = p - (self.lr / T.sqrt(acc_t + self.epsilon)) * g
        p_t = self.regularizer.weight_regularize(p_t)
        updates.append((p, p_t))
    return updates
Example #23
Source File: updates.py From iGAN with MIT License | 5 votes |
def max_norm(self, p, maxnorm):
    if maxnorm > 0:
        norms = T.sqrt(T.sum(T.sqr(p), axis=0))
        desired = T.clip(norms, 0, maxnorm)
        p = p * (desired / (1e-7 + norms))
    return p
Example #24
Source File: blocks.py From spinn with MIT License | 5 votes |
def HeKaimingInitializer():
    def HeKaimingInit(shape, real_shape=None):
        # Calculate fan-in / fan-out using real shape if given as override
        fan = real_shape or shape
        return np.random.normal(scale=np.sqrt(4.0/(fan[0] + fan[1])),
                                size=shape)
    return HeKaimingInit
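A small usage sketch (not from the project above), under the assumption that the initializer is used to build a Theano shared weight matrix; the layer sizes and the name W_hidden are made up for illustration:

import numpy as np
import theano

# HeKaimingInitializer as defined in the example above
init_fn = HeKaimingInitializer()

# sample a 512x256 weight matrix and wrap it as a shared variable
W_hidden = theano.shared(
    init_fn((512, 256)).astype(theano.config.floatX),
    name='W_hidden')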
Example #25
Source File: 4_modern_net.py From Theano-Tutorials with MIT License | 5 votes |
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
Example #26
Source File: 5_convolutional_net.py From Theano-Tutorials with MIT License | 5 votes |
def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates
Example #27
Source File: constraints.py From CAPTCHA-breaking with MIT License | 5 votes |
def __call__(self, p):
    norms = T.sqrt(T.sum(T.sqr(p), axis=0))
    desired = T.clip(norms, 0, self.m)
    p = p * (desired / (1e-7 + norms))
    return p
Example #28
Source File: constraints.py From CAPTCHA-breaking with MIT License | 5 votes |
def __call__(self, p):
    return p / T.sqrt(T.sum(p**2, axis=-1, keepdims=True))
Example #29
Source File: optimizers.py From CAPTCHA-breaking with MIT License | 5 votes |
def get_gradients(self, loss, params):
    grads = T.grad(loss, params)
    if hasattr(self, 'clipnorm') and self.clipnorm > 0:
        norm = T.sqrt(sum([T.sum(g ** 2) for g in grads]))
        grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
    return grads
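The clip_norm helper called above is not shown in this snippet. A minimal sketch of what such a helper commonly looks like is given below; this is an assumption for illustration, not verified against this project's actual definition:

import theano.tensor as T

def clip_norm(g, c, n):
    # Hypothetical helper: if the clipping threshold c is positive and the
    # global gradient norm n exceeds it, rescale g so the norm equals c.
    if c > 0:
        g = T.switch(T.ge(n, c), g * c / n, g)
    return g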
Example #30
Source File: optimizers.py From DL4MT with BSD 3-Clause "New" or "Revised" License | 5 votes |
def adadelta(lr, tparams, grads, inp, cost, profile=False):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up, profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update