Python theano.tensor.switch() Examples
The following are 30 code examples of theano.tensor.switch().
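Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of how theano.tensor.switch selects between two expressions elementwise; the variable names are illustrative only.

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
# switch(cond, ift, iff) picks elementwise: ift where cond is true, iff elsewhere
f = theano.function([x], T.switch(x < 0, 0.0, x))
print(f(numpy.array([-2.0, 0.5, 3.0])))   # -> [ 0.   0.5  3. ]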
Example #1
Source File: layers.py From DL4MT with BSD 3-Clause "New" or "Revised" License

def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    # re-scale dropout at training time, so we don't need to at test time
    if scaled:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj

# feedforward layer: affine transformation + point-wise nonlinearity
Example #2
Source File: util.py From gated-graph-transformer-network with MIT License

def reduce_log_sum(tensor, axis=None, guaranteed_finite=False):
    """
    Sum probabilities in the log domain, i.e return
        log(e^vec[0] + e^vec[1] + ...)
        = log(e^x e^(vec[0]-x) + e^x e^(vec[1]-x) + ...)
        = log(e^x [e^(vec[0]-x) + e^(vec[1]-x) + ...])
        = log(e^x) + log(e^(vec[0]-x) + e^(vec[1]-x) + ...)
        = x + log(e^(vec[0]-x) + e^(vec[1]-x) + ...)
    For numerical stability, we choose x = max(vec)

    Note that if x is -inf, that means all values are -inf,
    so the answer should be -inf. In this case, choose x = 0
    """
    maxval = T.max(tensor, axis)
    maxval_full = T.max(tensor, axis, keepdims=True)
    if not guaranteed_finite:
        maxval = T.switch(T.isfinite(maxval), maxval, T.zeros_like(maxval))
        maxval_full = T.switch(T.isfinite(maxval_full), maxval_full, T.zeros_like(maxval_full))
    reduced_sum = T.sum(T.exp(tensor - maxval_full), axis)
    logsum = maxval + T.log(reduced_sum)
    return logsum
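The docstring above derives the usual max-shift (log-sum-exp) trick; as a rough cross-check, the same computation can be written in plain numpy. The helper name logsumexp_np is mine, not part of the original repository.

import numpy as np

def logsumexp_np(vec):
    # same max-shift trick as reduce_log_sum, in plain numpy
    m = np.max(vec)
    if not np.isfinite(m):   # all entries are -inf, so the sum is -inf
        m = 0.0
    return m + np.log(np.sum(np.exp(vec - m)))

print(logsumexp_np(np.array([0.0, 0.0])))   # log(2) ~= 0.6931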
Example #3
Source File: optim.py From iaf with MIT License

def AdaMax(w, objective, alpha=.01, beta1=.1, beta2=.001):
    print 'AdaMax', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    for i in range(len(w)):
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        new[mom1] = (1-beta1) * mom1 + beta1 * g[i]
        new[_max] = T.maximum((1-beta2)*_max, abs(g[i]) + 1e-8)
        new[w[i]] = w[i] + alpha * new[mom1] / new[_max]

    return new

# AdaMax that averages over multiple minibatches
Example #4
Source File: theano_backend.py From GraphicDesignPatternByPython with MIT License

def switch(condition, then_expression, else_expression):
    """Switches between two operations depending on a scalar value.

    Note that both `then_expression` and `else_expression`
    should be symbolic tensors of the *same shape*.

    # Arguments
        condition: scalar tensor (`int` or `bool`).
        then_expression: either a tensor, or a callable that returns a tensor.
        else_expression: either a tensor, or a callable that returns a tensor.

    # Returns
        The selected tensor.
    """
    if callable(then_expression):
        then_expression = then_expression()
    if callable(else_expression):
        else_expression = else_expression()
    cond_ndim = ndim(condition)
    expr_ndim = ndim(then_expression)
    if cond_ndim < expr_ndim:
        ndim_diff = expr_ndim - cond_ndim
        for _ in range(ndim_diff):
            condition = expand_dims(condition)
    return T.switch(condition, then_expression, else_expression)
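The dimension-padding step above is what lets a per-sample condition select whole rows. A small raw-Theano sketch of the same idea (the variable names are mine, not Keras code):

import numpy as np
import theano
import theano.tensor as T

cond = T.vector('cond')                 # one flag per sample, shape (batch,)
a = T.matrix('a')                       # shape (batch, features)
b = T.matrix('b')
cond_padded = T.shape_padright(cond)    # shape (batch, 1), broadcasts over features
f = theano.function([cond, a, b], T.switch(cond_padded, a, b))

print(f(np.array([1.0, 0.0]), np.ones((2, 3)), np.zeros((2, 3))))
# first row comes from a, second row from b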
Example #5
Source File: base.py From carl with BSD 3-Clause "New" or "Revised" License

def bound(expression, out, *predicates):
    """Bound a theano expression.

    Parameters
    ----------
    * `expression` [theano expression]:
        The expression to bound.

    * `out` [theano expression]:
        The out-of-bounds value.

    * `*predicates` [list of theano expressions]:
        The list of predicates defining the boundaries of `expression`.

    Returns
    -------
    * `value` [theano expression]:
        The bounded expression.
    """
    guard = 1

    for p in predicates:
        guard *= p

    return T.switch(guard, expression, out)
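For illustration only (not from the carl sources): multiplying the predicates acts as a logical AND, and T.switch supplies the out-of-bounds value.

import theano
import theano.tensor as T

x = T.vector('x')
guard = (x >= 0.) * (x <= 1.)              # 1 inside [0, 1], 0 outside
bounded = T.switch(guard, x ** 2, -1.0)    # same effect as bound(x ** 2, -1.0, x >= 0., x <= 1.)
f = theano.function([x], bounded)
print(f([-0.5, 0.5, 2.0]))   # -> [-1.    0.25 -1.  ]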
Example #6
Source File: keras_extensions.py From visual_turing_test-tutorial with MIT License

def time_distributed_nonzero_max_pooling(x):
    """
    Computes maximum along the first (time) dimension.
    It ignores the mask m.

    In: x - input; a 3D tensor
        mask_value - value to mask out, if None then no masking;
            by default 0.0
    """
    import theano.tensor as T

    mask_value = 0.0
    x = T.switch(T.eq(x, mask_value), -numpy.inf, x)
    masked_max_x = x.max(axis=1)
    # replace infinities with mask_value
    masked_max_x = T.switch(T.eq(masked_max_x, -numpy.inf), 0, masked_max_x)
    return masked_max_x
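The same -inf masking trick, shown on a small 2D numpy array (batch x time) purely for illustration:

import numpy as np

x = np.array([[0.0, 2.0, 0.0],
              [0.0, 0.0, 0.0]])            # second row is entirely masked
masked = np.where(x == 0.0, -np.inf, x)
m = masked.max(axis=1)
print(np.where(m == -np.inf, 0.0, m))      # -> [2. 0.]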
Example #7
Source File: keras_extensions.py From visual_turing_test-tutorial with MIT License

def time_distributed_masked_max(x, m):
    """
    Computes max along the first (time) dimension.

    In: x - input; a 3D tensor
        m - mask
        m_value - value for masking
    """
    # place infinities where mask is off
    m_value = 0.0
    tmp = K.switch(K.equal(m, 0.0), -numpy.inf, 0.0)
    x_with_inf = x + K.expand_dims(tmp)
    x_max = K.max(x_with_inf, axis=1)
    r = K.switch(K.equal(x_max, -numpy.inf), m_value, x_max)
    return r


## classes ##

# Transforms existing layers to masked layers
Example #8
Source File: toolbox.py From Theano-Lights with MIT License

def sgdmgc(cost, params, lr=1.0, alpha=0.1, max_magnitude=5.0, infDecay=0.1):
    """SGD with momentum and gradient clipping"""
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    for p, g in zip(params, grads):
        v = shared(p.get_value() * 0.)
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        v_new = v * (1.0 - alpha) - alpha * lr * g
        updates.append((v, v_new))
        updates.append((p, p + v_new))

    return updates, norm
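The rescaling factor can be checked with a quick numpy calculation (illustrative values only): a global gradient norm of 13 with max_magnitude=5 scales every gradient by 5/13.

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]
max_magnitude = 5.0
sqrtnorm = np.sqrt(sum((g ** 2).sum() for g in grads))            # 13.0
scale = max_magnitude / sqrtnorm if sqrtnorm >= max_magnitude else 1.0
print(scale)                                                       # ~0.3846
print(np.sqrt(sum(((g * scale) ** 2).sum() for g in grads)))       # 5.0 after clipping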
Example #9
Source File: __init__.py From adversarial with BSD 3-Clause "New" or "Revised" License

def __init__(self, scale_grads=1, target_scale=.1,
             discriminator_default_input_include_prob = 1.,
             discriminator_input_include_probs=None,
             discriminator_default_input_scale=1.,
             discriminator_input_scales=None,
             generator_default_input_include_prob = 1.,
             generator_default_input_scale=1.,
             inference_default_input_include_prob=None,
             inference_input_include_probs=None,
             inference_default_input_scale=1.,
             inference_input_scales=None,
             init_now_train_generator=True,
             ever_train_discriminator=True,
             ever_train_generator=True,
             ever_train_inference=True,
             no_drop_in_d_for_g=False,
             alternate_g = False):
    self.__dict__.update(locals())
    del self.self
    # These allow you to dynamically switch off training parts.
    # If the corresponding ever_train_* is False, these have
    # no effect.
    self.now_train_generator = sharedX(init_now_train_generator)
    self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32'))
    self.now_train_inference = sharedX(numpy.array(1., dtype='float32'))
Example #10
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #11
Source File: ff_layers.py From LV_groundhog with BSD 3-Clause "New" or "Revised" License

def fprop(self, all_states):
    if self.ntimes:
        stateshape0 = all_states.shape[0]
        shape0 = TT.switch(TT.gt(self.n, 0), self.n, all_states.shape[0])

        single_frame = TT.shape_padleft(all_states[stateshape0-1])
        mask = TT.alloc(numpy.float32(1), shape0, *[1 for k in xrange(all_states.ndim-1)])
        rval = single_frame * mask
        self.out = rval
        return rval

    single_frame = all_states[all_states.shape[0]-1]
    self.out = single_frame
    return single_frame
Example #12
Source File: gru4rec.py From sars_tutorial with MIT License

def execute(self, X):
    return self.lmbd * T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
Example #13
Source File: gru4rec.py From sars_tutorial with MIT License

def execute(self, X):
    return T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
Example #14
Source File: exponential.py From carl with BSD 3-Clause "New" or "Revised" License

def __init__(self, inverse_scale=1.0):
    """Constructor.

    Parameters
    ----------
    * `inverse_scale` [float]:
        The inverse scale.
    """
    super(Exponential, self).__init__(inverse_scale=inverse_scale)

    # pdf
    self.pdf_ = T.switch(
        T.lt(self.X, 0.),
        0.,
        self.inverse_scale * T.exp(-self.inverse_scale * self.X)).ravel()
    self._make(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = bound(
        -T.log(self.inverse_scale) + self.inverse_scale * self.X,
        np.inf,
        self.inverse_scale > 0.).ravel()
    self._make(self.nll_, "nll")

    # cdf
    self.cdf_ = (1. - T.exp(-self.inverse_scale * self.X)).ravel()
    self._make(self.cdf_, "cdf")

    # ppf
    self.ppf_ = -T.log(1. - self.p) / self.inverse_scale
    self._make(self.ppf_, "ppf", args=[self.p])
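For reference, the closed forms used above can be checked numerically with plain numpy (the values below are my own, not from the carl tests):

import numpy as np

lam = 2.0                         # inverse_scale
x = 0.5
print(lam * np.exp(-lam * x))     # pdf(0.5)  ~= 0.7358
print(1.0 - np.exp(-lam * x))     # cdf(0.5)  ~= 0.6321
print(-np.log(1.0 - 0.5) / lam)   # ppf(0.5), the median ~= 0.3466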
Example #15
Source File: gram.py From gram with BSD 3-Clause "New" or "Revised" License

def dropout_layer(state_before, use_noise, trng, prob):
    proj = T.switch(use_noise,
                    (state_before * trng.binomial(state_before.shape, p=prob, n=1, dtype=state_before.dtype)),
                    state_before * 0.5)
    return proj
Example #16
Source File: helpers.py From deep-prior with GNU General Public License v3.0

def TruncLin(x):
    """
    Truncated linear unit
    :param x: input value
    :return: max(min(x,1),-1)
    """
    import theano.tensor as T
    return T.switch(x < -1, -1, T.switch(x > 1, 1, x))
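A quick check of the nested switch (illustrative only):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
trunclin = theano.function([x], T.switch(x < -1, -1, T.switch(x > 1, 1, x)))
print(trunclin(np.array([-3.0, 0.2, 5.0])))   # -> [-1.   0.2  1. ]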
Example #17
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #18
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #19
Source File: helpers.py From deep-prior with GNU General Public License v3.0

def InvReLU(x):
    """
    Inverted rectified linear unit
    :param x: input value
    :return: max(-x,0)
    """
    import theano.tensor as T
    x *= -1.
    return T.switch(x < 0, 0, x)
Example #20
Source File: simple.py From attention-lvcsr with MIT License

def apply(self, input_):
    return tensor.switch(input_ > 0, input_, 0)
Example #21
Source File: toolbox.py From Theano-Lights with MIT License

def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, norm

#--------------------------------------------------------------------------------------------------
Example #22
Source File: toolbox.py From Theano-Lights with MIT License

def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        #e_t = shared(p.get_value() * 0.)
        #de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05 #*p_t
        #p_t = p_t + de_t
        #updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, norm
Example #23
Source File: toolbox.py From Theano-Lights with MIT License

def sgdgc(cost, params, lr=1.0, max_magnitude=5.0, infDecay=0.1):
    """SGD with gradient clipping"""
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    #not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    for p, g in zip(params, grads):
        #g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        updates.append((p, p - lr * g * adj_norm_gs))

    return updates, norm
Example #24
Source File: regression.py From Diffusion-Probabilistic-Models with MIT License

def apply(self, input_):
    return T.switch(input_ > 0, input_, 0.05*input_)
Example #25
Source File: session_encdec.py From hred-qs with BSD 3-Clause "New" or "Revised" License

def compute_updates(self, training_cost, params):
    updates = []

    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    c = numpy.float32(self.cutoff)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))

    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates
Example #26
Source File: updates.py From dcgan_code with MIT License

def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g*c/n, g)
    return g
Example #27
Source File: optim.py From iaf with MIT License

def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
    if n_accum == 1:
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
    print 'AdaMax_Avg2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3,'n_accum:',n_accum

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(it,n_accum), 0)
    update = T.eq(T.mod(it,n_accum), n_accum-1)

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0.)
            w_avg[i] = G.sharedf(_w.get_value())
            g_sum = G.sharedf(_w.get_value() * 0.)

            new[g_sum] = ifelse(reset, _g, g_sum + _g)
            new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
            new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
            new[_w] = ifelse(update, _w + alpha * new[mom1] / new[_max], _w)
            new[w_avg[i]] = ifelse(update, beta3 * new[_w] + (1.-beta3) * w_avg[i], w_avg[i])
        ws_avg += [w_avg]
    return new, ws_avg
Example #28
Source File: optim.py From iaf with MIT License

def AdaMaxAvg(ws, ws_avg, objective, alpha=.01, beta1=.1, beta2=.001, update_keys=None, disconnected_inputs='raise'):
    print 'AdaMax_Avg', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs=disconnected_inputs) #warn/raise

    if update_keys is None:
        update_keys = [ws[j].keys() for j in range(len(ws))]

    new = OrderedDict()
    for j in range(len(ws)):
        if ws_avg is not None:
            w_avg = ws_avg[j]
        for i in update_keys[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0. + 1e-8)

            new[mom1] = (1-beta1) * mom1 + beta1 * _g
            new[_max] = T.maximum((1-beta2)*_max, abs(_g) + 1e-8)
            new[_w] = _w + alpha * new[mom1] / new[_max]
            if ws_avg is not None:
                new[w_avg[i]] = beta2 * _w + (1.-beta2) * w_avg[i]
    return new

# Eve that keeps running average of parameter
Example #29
Source File: optim.py From iaf with MIT License

def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    print 'AdaMax2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2, 'n_accum:', n_accum
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(new[it],n_accum), 0)
    update = T.eq(T.mod(new[it],n_accum), n_accum-1)

    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        g_sum = G.sharedf(w[i].get_value() * 0.)

        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
        new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
        new[w[i]] = ifelse(update, w[i] + alpha * new[mom1] / new[_max], w[i])

    return new

# AdaMax that keeps running average of parameter
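The reset/update gating above is plain modular arithmetic on the step counter; a tiny pure-Python table (illustrative, with n_accum=2) shows which branch fires at each step:

n_accum = 2
for step in range(1, 7):                      # mirrors the incremented counter new[it]
    reset = (step % n_accum) == 0             # restart the gradient accumulator
    update = (step % n_accum) == n_accum - 1  # apply the accumulated gradient
    print(step, reset, update)
# steps 1, 3, 5 apply an update; steps 2, 4, 6 reset the running sum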
Example #30
Source File: activation.py From OpenDeep with Apache License 2.0

def elu(x, alpha=1):
    """
    (from Lasagne https://github.com/Lasagne/Lasagne/blob/master/lasagne/nonlinearities.py)

    Exponential Linear Unit :math:`\\varphi(x) = (x > 0) ? x : e^x - 1`

    The Exponential Linear Unit (ELU) was introduced in [1]_. Compared to the
    linear rectifier :func:`rectify`, it has a mean activation closer to zero
    and nonzero gradient for negative input, which can help convergence.
    Compared to the leaky rectifier, it saturates for highly negative inputs.

    Parameters
    ----------
    x : float32
        The activation (the summed, weighted input of a neuron).

    Returns
    -------
    float32
        The output of the exponential linear unit for the activation.

    References
    ----------
    .. [1] Djork-Arne Clevert, Thomas Unterthiner, Sepp Hochreiter (2015):
       Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs),
       http://arxiv.org/abs/1511.07289
    """
    assert alpha > 0, "alpha parameter to ELU has to be > 0."
    return T.switch(x > 0, x, alpha*(T.exp(x) - 1))
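A short numeric check of the formula (standalone, not part of OpenDeep):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
elu_fn = theano.function([x], T.switch(x > 0, x, 1.0 * (T.exp(x) - 1)))
print(elu_fn(np.array([-1.0, 0.0, 2.0])))   # -> [-0.6321  0.      2.    ]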