Python theano.tensor.switch() Examples
The following are 30 code examples of theano.tensor.switch().
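Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of how theano.tensor.switch selects between two expressions elementwise; the variable names are illustrative only.

import numpy
import theano
import theano.tensor as T

x = T.vector('x')
# switch(cond, ift, iff) picks elementwise: ift where cond is true, iff elsewhere
f = theano.function([x], T.switch(x < 0, 0.0, x))
print(f(numpy.array([-2.0, 0.5, 3.0])))   # -> [ 0.   0.5  3. ]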
Example #1
Source File: layers.py From DL4MT with BSD 3-Clause "New" or "Revised" License

def shared_dropout_layer(shape, use_noise, trng, value, scaled=True):
    # re-scale dropout at training time, so we don't need to at test time
    if scaled:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32') / value,
            theano.shared(numpy.float32(1.)))
    else:
        proj = tensor.switch(
            use_noise,
            trng.binomial(shape, p=value, n=1, dtype='float32'),
            theano.shared(numpy.float32(value)))
    return proj

# feedforward layer: affine transformation + point-wise nonlinearity
Example #2
Source File: util.py From gated-graph-transformer-network with MIT License

def reduce_log_sum(tensor, axis=None, guaranteed_finite=False):
    """
    Sum probabilities in the log domain, i.e return
        log(e^vec[0] + e^vec[1] + ...)
        = log(e^x e^(vec[0]-x) + e^x e^(vec[1]-x) + ...)
        = log(e^x [e^(vec[0]-x) + e^(vec[1]-x) + ...])
        = log(e^x) + log(e^(vec[0]-x) + e^(vec[1]-x) + ...)
        = x + log(e^(vec[0]-x) + e^(vec[1]-x) + ...)
    For numerical stability, we choose x = max(vec)

    Note that if x is -inf, that means all values are -inf,
    so the answer should be -inf. In this case, choose x = 0
    """
    maxval = T.max(tensor, axis)
    maxval_full = T.max(tensor, axis, keepdims=True)
    if not guaranteed_finite:
        maxval = T.switch(T.isfinite(maxval), maxval, T.zeros_like(maxval))
        maxval_full = T.switch(T.isfinite(maxval_full), maxval_full, T.zeros_like(maxval_full))
    reduced_sum = T.sum(T.exp(tensor - maxval_full), axis)
    logsum = maxval + T.log(reduced_sum)
    return logsum
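The docstring above derives the usual max-shift (log-sum-exp) trick; as a rough cross-check, the same computation can be written in plain numpy. The helper name logsumexp_np is mine, not part of the original repository.

import numpy as np

def logsumexp_np(vec):
    # same max-shift trick as reduce_log_sum, in plain numpy
    m = np.max(vec)
    if not np.isfinite(m):   # all entries are -inf, so the sum is -inf
        m = 0.0
    return m + np.log(np.sum(np.exp(vec - m)))

print(logsumexp_np(np.array([0.0, 0.0])))   # log(2) ~= 0.6931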
Example #3
Source File: optim.py From iaf with MIT License

def AdaMax(w, objective, alpha=.01, beta1=.1, beta2=.001):
    print 'AdaMax', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    for i in range(len(w)):
        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        new[mom1] = (1-beta1) * mom1 + beta1 * g[i]
        new[_max] = T.maximum((1-beta2)*_max, abs(g[i]) + 1e-8)
        new[w[i]] = w[i] + alpha * new[mom1] / new[_max]

    return new

# AdaMax that averages over multiple minibatches
Example #4
Source File: theano_backend.py From GraphicDesignPatternByPython with MIT License

def switch(condition, then_expression, else_expression):
    """Switches between two operations depending on a scalar value.

    Note that both `then_expression` and `else_expression`
    should be symbolic tensors of the *same shape*.

    # Arguments
        condition: scalar tensor (`int` or `bool`).
        then_expression: either a tensor, or a callable that returns a tensor.
        else_expression: either a tensor, or a callable that returns a tensor.

    # Returns
        The selected tensor.
    """
    if callable(then_expression):
        then_expression = then_expression()
    if callable(else_expression):
        else_expression = else_expression()
    cond_ndim = ndim(condition)
    expr_ndim = ndim(then_expression)
    if cond_ndim < expr_ndim:
        ndim_diff = expr_ndim - cond_ndim
        for _ in range(ndim_diff):
            condition = expand_dims(condition)
    return T.switch(condition, then_expression, else_expression)
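The dimension-padding step above is what lets a per-sample condition select whole rows. A small raw-Theano sketch of the same idea (the variable names are mine, not Keras code):

import numpy as np
import theano
import theano.tensor as T

cond = T.vector('cond')                 # one flag per sample, shape (batch,)
a = T.matrix('a')                       # shape (batch, features)
b = T.matrix('b')
cond_padded = T.shape_padright(cond)    # shape (batch, 1), broadcasts over features
f = theano.function([cond, a, b], T.switch(cond_padded, a, b))

print(f(np.array([1.0, 0.0]), np.ones((2, 3)), np.zeros((2, 3))))
# first row comes from a, second row from b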
Example #5
Source File: base.py From carl with BSD 3-Clause "New" or "Revised" License

def bound(expression, out, *predicates):
    """Bound a theano expression.

    Parameters
    ----------
    * `expression` [theano expression]:
        The expression to bound.

    * `out` [theano expression]:
        The out-of-bounds value.

    * `*predicates` [list of theano expressions]:
        The list of predicates defining the boundaries of `expression`.

    Returns
    -------
    * `value` [theano expression]:
        The bounded expression.
    """
    guard = 1

    for p in predicates:
        guard *= p

    return T.switch(guard, expression, out)
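For illustration only (not from the carl sources): multiplying the predicates acts as a logical AND, and T.switch supplies the out-of-bounds value.

import theano
import theano.tensor as T

x = T.vector('x')
guard = (x >= 0.) * (x <= 1.)              # 1 inside [0, 1], 0 outside
bounded = T.switch(guard, x ** 2, -1.0)    # same effect as bound(x ** 2, -1.0, x >= 0., x <= 1.)
f = theano.function([x], bounded)
print(f([-0.5, 0.5, 2.0]))   # -> [-1.    0.25 -1.  ]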
Example #6
Source File: keras_extensions.py From visual_turing_test-tutorial with MIT License

def time_distributed_nonzero_max_pooling(x):
    """
    Computes maximum along the first (time) dimension.
    It ignores the mask m.

    In: x - input; a 3D tensor
        mask_value - value to mask out, if None then no masking;
            by default 0.0
    """
    import theano.tensor as T

    mask_value = 0.0
    x = T.switch(T.eq(x, mask_value), -numpy.inf, x)
    masked_max_x = x.max(axis=1)
    # replace infinities with mask_value
    masked_max_x = T.switch(T.eq(masked_max_x, -numpy.inf), 0, masked_max_x)
    return masked_max_x
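The same -inf masking trick, shown on a small 2D numpy array (batch x time) purely for illustration:

import numpy as np

x = np.array([[0.0, 2.0, 0.0],
              [0.0, 0.0, 0.0]])            # second row is entirely masked
masked = np.where(x == 0.0, -np.inf, x)
m = masked.max(axis=1)
print(np.where(m == -np.inf, 0.0, m))      # -> [2. 0.]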
Example #7
Source File: keras_extensions.py From visual_turing_test-tutorial with MIT License

def time_distributed_masked_max(x, m):
    """
    Computes max along the first (time) dimension.

    In: x - input; a 3D tensor
        m - mask
        m_value - value for masking
    """
    # place infinities where mask is off
    m_value = 0.0
    tmp = K.switch(K.equal(m, 0.0), -numpy.inf, 0.0)
    x_with_inf = x + K.expand_dims(tmp)
    x_max = K.max(x_with_inf, axis=1)
    r = K.switch(K.equal(x_max, -numpy.inf), m_value, x_max)
    return r


## classes ##

# Transforms existing layers to masked layers
Example #8
Source File: toolbox.py From Theano-Lights with MIT License

def sgdmgc(cost, params, lr=1.0, alpha=0.1, max_magnitude=5.0, infDecay=0.1):
    """SGD with momentum and gradient clipping"""
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    for p, g in zip(params, grads):
        v = shared(p.get_value() * 0.)
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        v_new = v * (1.0 - alpha) - alpha * lr * g
        updates.append((v, v_new))
        updates.append((p, p + v_new))

    return updates, norm
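The rescaling factor can be checked with a quick numpy calculation (illustrative values only): a global gradient norm of 13 with max_magnitude=5 scales every gradient by 5/13.

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]
max_magnitude = 5.0
sqrtnorm = np.sqrt(sum((g ** 2).sum() for g in grads))            # 13.0
scale = max_magnitude / sqrtnorm if sqrtnorm >= max_magnitude else 1.0
print(scale)                                                       # ~0.3846
print(np.sqrt(sum(((g * scale) ** 2).sum() for g in grads)))       # 5.0 after clipping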
Example #9
Source File: __init__.py From adversarial with BSD 3-Clause "New" or "Revised" License

def __init__(self, scale_grads=1, target_scale=.1,
             discriminator_default_input_include_prob = 1.,
             discriminator_input_include_probs=None,
             discriminator_default_input_scale=1.,
             discriminator_input_scales=None,
             generator_default_input_include_prob = 1.,
             generator_default_input_scale=1.,
             inference_default_input_include_prob=None,
             inference_input_include_probs=None,
             inference_default_input_scale=1.,
             inference_input_scales=None,
             init_now_train_generator=True,
             ever_train_discriminator=True,
             ever_train_generator=True,
             ever_train_inference=True,
             no_drop_in_d_for_g=False,
             alternate_g = False):
    self.__dict__.update(locals())
    del self.self
    # These allow you to dynamically switch off training parts.
    # If the corresponding ever_train_* is False, these have
    # no effect.
    self.now_train_generator = sharedX(init_now_train_generator)
    self.now_train_discriminator = sharedX(numpy.array(1., dtype='float32'))
    self.now_train_inference = sharedX(numpy.array(1., dtype='float32'))
Example #10
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #11
Source File: ff_layers.py From LV_groundhog with BSD 3-Clause "New" or "Revised" License

def fprop(self, all_states):
    if self.ntimes:
        stateshape0 = all_states.shape[0]
        shape0 = TT.switch(TT.gt(self.n, 0), self.n, all_states.shape[0])

        single_frame = TT.shape_padleft(all_states[stateshape0-1])
        mask = TT.alloc(numpy.float32(1), shape0, *[1 for k in xrange(all_states.ndim-1)])
        rval = single_frame * mask
        self.out = rval
        return rval

    single_frame = all_states[all_states.shape[0]-1]
    self.out = single_frame
    return single_frame
Example #12
Source File: gru4rec.py From sars_tutorial with MIT License

def execute(self, X):
    return self.lmbd * T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
Example #13
Source File: gru4rec.py From sars_tutorial with MIT License

def execute(self, X):
    return T.switch(T.ge(X, 0), X, self.alpha * (T.exp(X) - 1))
Example #14
Source File: exponential.py From carl with BSD 3-Clause "New" or "Revised" License

def __init__(self, inverse_scale=1.0):
    """Constructor.

    Parameters
    ----------
    * `inverse_scale` [float]:
        The inverse scale.
    """
    super(Exponential, self).__init__(inverse_scale=inverse_scale)

    # pdf
    self.pdf_ = T.switch(
        T.lt(self.X, 0.),
        0.,
        self.inverse_scale * T.exp(-self.inverse_scale * self.X)).ravel()
    self._make(self.pdf_, "pdf")

    # -log pdf
    self.nll_ = bound(
        -T.log(self.inverse_scale) + self.inverse_scale * self.X,
        np.inf,
        self.inverse_scale > 0.).ravel()
    self._make(self.nll_, "nll")

    # cdf
    self.cdf_ = (1. - T.exp(-self.inverse_scale * self.X)).ravel()
    self._make(self.cdf_, "cdf")

    # ppf
    self.ppf_ = -T.log(1. - self.p) / self.inverse_scale
    self._make(self.ppf_, "ppf", args=[self.p])
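For reference, the closed forms used above can be checked numerically with plain numpy (the values below are my own, not from the carl tests):

import numpy as np

lam = 2.0                         # inverse_scale
x = 0.5
print(lam * np.exp(-lam * x))     # pdf(0.5)  ~= 0.7358
print(1.0 - np.exp(-lam * x))     # cdf(0.5)  ~= 0.6321
print(-np.log(1.0 - 0.5) / lam)   # ppf(0.5), the median ~= 0.3466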
Example #15
Source File: gram.py From gram with BSD 3-Clause "New" or "Revised" License

def dropout_layer(state_before, use_noise, trng, prob):
    proj = T.switch(use_noise,
                    (state_before * trng.binomial(state_before.shape, p=prob, n=1, dtype=state_before.dtype)),
                    state_before * 0.5)
    return proj
Example #16
Source File: helpers.py From deep-prior with GNU General Public License v3.0

def TruncLin(x):
    """
    Truncated linear unit
    :param x: input value
    :return: max(min(x,1),-1)
    """
    import theano.tensor as T
    return T.switch(x < -1, -1, T.switch(x > 1, 1, x))
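A quick check of the nested switch (illustrative only):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
trunclin = theano.function([x], T.switch(x < -1, -1, T.switch(x > 1, 1, x)))
print(trunclin(np.array([-3.0, 0.2, 5.0])))   # -> [-1.   0.2  1. ]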
Example #17
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #18
Source File: PoolLayer.py From NSC with MIT License

def __init__(self, input, rate, istrain):
    rate = numpy.float32(rate)
    self.input = input
    srng = T.shared_randomstreams.RandomStreams()

    mask = srng.binomial(n=1, p=numpy.float32(1-rate), size=input.shape, dtype='float32')
    self.output = T.switch(istrain, mask*self.input, self.input*numpy.float32(1-rate))
    self.params = []
Example #19
Source File: helpers.py From deep-prior with GNU General Public License v3.0

def InvReLU(x):
    """
    Inverted rectified linear unit
    :param x: input value
    :return: max(-x,0)
    """
    import theano.tensor as T
    x *= -1.
    return T.switch(x < 0, 0, x)
Example #20
Source File: simple.py From attention-lvcsr with MIT License

def apply(self, input_):
    return tensor.switch(input_ > 0, input_, 0)
Example #21
Source File: toolbox.py From Theano-Lights with MIT License

def adamgc(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, norm

#--------------------------------------------------------------------------------------------------
Example #22
Source File: toolbox.py From Theano-Lights with MIT License

def adamgc_(cost, params, lr=0.0002, b1=0.1, b2=0.01, e=1e-8, max_magnitude=5.0, infDecay=0.1):
    updates = []
    grads = T.grad(cost, params)

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    i = shared(floatX(0.))
    i_t = i + 1.
    fix1 = 1. - (1. - b1)**i_t
    fix2 = 1. - (1. - b2)**i_t
    lr_t = lr * (T.sqrt(fix2) / fix1)

    for p, g in zip(params, grads):
        g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        m = shared(p.get_value() * 0.)
        v = shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)

        #e_t = shared(p.get_value() * 0.)
        #de_t = (srnd.normal(p.shape, std = 0.05, dtype=theano.config.floatX)*p_t - e_t)*0.05 #*p_t
        #p_t = p_t + de_t
        #updates.append((e_t, e_t + de_t))

        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))

    updates.append((i, i_t))
    return updates, norm
Example #23
Source File: toolbox.py From Theano-Lights with MIT License

def sgdgc(cost, params, lr=1.0, max_magnitude=5.0, infDecay=0.1):
    """SGD with gradient clipping"""
    grads = T.grad(cost=cost, wrt=params)
    updates = []

    norm = norm_gs(params, grads)
    sqrtnorm = T.sqrt(norm)
    #not_finite = T.or_(T.isnan(sqrtnorm), T.isinf(sqrtnorm))
    adj_norm_gs = T.switch(T.ge(sqrtnorm, max_magnitude), max_magnitude / sqrtnorm, 1.)

    for p, g in zip(params, grads):
        #g = T.switch(not_finite, infDecay * p, g * adj_norm_gs)
        updates.append((p, p - lr * g * adj_norm_gs))

    return updates, norm
Example #24
Source File: regression.py From Diffusion-Probabilistic-Models with MIT License

def apply(self, input_):
    return T.switch(input_ > 0, input_, 0.05*input_)
Example #25
Source File: session_encdec.py From hred-qs with BSD 3-Clause "New" or "Revised" License

def compute_updates(self, training_cost, params):
    updates = []

    grads = T.grad(training_cost, params)
    grads = OrderedDict(zip(params, grads))

    # Clip stuff
    c = numpy.float32(self.cutoff)
    clip_grads = []

    norm_gs = T.sqrt(sum(T.sum(g ** 2) for p, g in grads.items()))
    normalization = T.switch(T.ge(norm_gs, c), c / norm_gs, np.float32(1.))
    notfinite = T.or_(T.isnan(norm_gs), T.isinf(norm_gs))

    for p, g in grads.items():
        clip_grads.append((p, T.switch(notfinite, numpy.float32(.1) * p, g * normalization)))

    grads = OrderedDict(clip_grads)

    if self.updater == 'adagrad':
        updates = Adagrad(grads, self.lr)
    elif self.updater == 'sgd':
        raise Exception("Sgd not implemented!")
    elif self.updater == 'adadelta':
        updates = Adadelta(grads)
    elif self.updater == 'rmsprop':
        updates = RMSProp(grads, self.lr)
    elif self.updater == 'adam':
        updates = Adam(grads)
    else:
        raise Exception("Updater not understood!")

    return updates
Example #26
Source File: updates.py From dcgan_code with MIT License

def clip_norm(g, c, n):
    if c > 0:
        g = T.switch(T.ge(n, c), g*c/n, g)
    return g
Example #27
Source File: optim.py From iaf with MIT License

def AdaMaxAvg2(ws, objective, alpha=.01, beta1=.1, beta2=.001, beta3=0.01, n_accum=1):
    if n_accum == 1:
        return AdaMaxAvg(ws, objective, alpha, beta1, beta2, beta3)
    print 'AdaMax_Avg2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2,'beta3:',beta3,'n_accum:',n_accum

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs='raise')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(it,n_accum), 0)
    update = T.eq(T.mod(it,n_accum), n_accum-1)

    ws_avg = []
    for j in range(len(ws)):
        w_avg = {}
        for i in ws[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0.)
            w_avg[i] = G.sharedf(_w.get_value())
            g_sum = G.sharedf(_w.get_value() * 0.)

            new[g_sum] = ifelse(reset, _g, g_sum + _g)
            new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
            new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
            new[_w] = ifelse(update, _w + alpha * new[mom1] / new[_max], _w)
            new[w_avg[i]] = ifelse(update, beta3 * new[_w] + (1.-beta3) * w_avg[i], w_avg[i])
        ws_avg += [w_avg]
    return new, ws_avg
Example #28
Source File: optim.py From iaf with MIT License

def AdaMaxAvg(ws, ws_avg, objective, alpha=.01, beta1=.1, beta2=.001, update_keys=None, disconnected_inputs='raise'):
    print 'AdaMax_Avg', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2

    gs = G.ndict.T_grad(objective.sum(), ws, disconnected_inputs=disconnected_inputs) #warn/raise

    if update_keys is None:
        update_keys = [ws[j].keys() for j in range(len(ws))]

    new = OrderedDict()
    for j in range(len(ws)):
        if ws_avg is not None:
            w_avg = ws_avg[j]
        for i in update_keys[j]:
            _w = ws[j][i]
            _g = gs[j][i]
            #_g = T.switch(T.isnan(_g),T.zeros_like(_g),_g) #remove NaN's
            mom1 = G.sharedf(_w.get_value() * 0.)
            _max = G.sharedf(_w.get_value() * 0. + 1e-8)

            new[mom1] = (1-beta1) * mom1 + beta1 * _g
            new[_max] = T.maximum((1-beta2)*_max, abs(_g) + 1e-8)
            new[_w] = _w + alpha * new[mom1] / new[_max]
            if ws_avg is not None:
                new[w_avg[i]] = beta2 * _w + (1.-beta2) * w_avg[i]
    return new

# Eve that keeps running average of parameter
Example #29
Source File: optim.py From iaf with MIT License

def AdaMax2(w, objective, alpha=.01, beta1=.1, beta2=.001, n_accum=2):
    print 'AdaMax2', 'alpha:',alpha,'beta1:',beta1,'beta2:',beta2, 'n_accum:', n_accum
    g = T.grad(objective.sum(), w, disconnected_inputs='warn')

    new = OrderedDict()

    from theano.ifelse import ifelse
    it = G.sharedf(0.)
    new[it] = it + 1
    reset = T.eq(T.mod(new[it],n_accum), 0)
    update = T.eq(T.mod(new[it],n_accum), n_accum-1)

    for i in range(len(w)):
        mom1 = G.sharedf(w[i].get_value() * 0.)
        _max = G.sharedf(w[i].get_value() * 0.)
        g_sum = G.sharedf(w[i].get_value() * 0.)

        #gi = T.switch(T.isnan(gi),T.zeros_like(gi),gi) #remove NaN's
        new[g_sum] = ifelse(reset, g[i], g_sum + g[i])
        new[mom1] = ifelse(update, (1-beta1) * mom1 + beta1 * new[g_sum], mom1)
        new[_max] = ifelse(update, T.maximum((1-beta2)*_max, abs(new[g_sum]) + 1e-8), _max)
        new[w[i]] = ifelse(update, w[i] + alpha * new[mom1] / new[_max], w[i])

    return new

# AdaMax that keeps running average of parameter
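The reset/update gating above is plain modular arithmetic on the step counter; a tiny pure-Python table (illustrative, with n_accum=2) shows which branch fires at each step:

n_accum = 2
for step in range(1, 7):                      # mirrors the incremented counter new[it]
    reset = (step % n_accum) == 0             # restart the gradient accumulator
    update = (step % n_accum) == n_accum - 1  # apply the accumulated gradient
    print(step, reset, update)
# steps 1, 3, 5 apply an update; steps 2, 4, 6 reset the running sum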
Example #30
Source File: activation.py From OpenDeep with Apache License 2.0

def elu(x, alpha=1):
    """
    (from Lasagne https://github.com/Lasagne/Lasagne/blob/master/lasagne/nonlinearities.py)

    Exponential Linear Unit :math:`\\varphi(x) = (x > 0) ? x : e^x - 1`

    The Exponential Linear Unit (ELU) was introduced in [1]_. Compared to the
    linear rectifier :func:`rectify`, it has a mean activation closer to zero
    and nonzero gradient for negative input, which can help convergence.
    Compared to the leaky rectifier, it saturates for highly negative inputs.

    Parameters
    ----------
    x : float32
        The activation (the summed, weighted input of a neuron).

    Returns
    -------
    float32
        The output of the exponential linear unit for the activation.

    References
    ----------
    .. [1] Djork-Arne Clevert, Thomas Unterthiner, Sepp Hochreiter (2015):
       Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs),
       http://arxiv.org/abs/1511.07289
    """
    assert alpha > 0, "alpha parameter to ELU has to be > 0."
    return T.switch(x > 0, x, alpha*(T.exp(x) - 1))
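A short numeric check of the formula (standalone, not part of OpenDeep):

import numpy as np
import theano
import theano.tensor as T

x = T.vector('x')
elu_fn = theano.function([x], T.switch(x > 0, x, 1.0 * (T.exp(x) - 1)))
print(elu_fn(np.array([-1.0, 0.0, 2.0])))   # -> [-0.6321  0.      2.    ]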