Python keras.backend.softmax() Examples
The following are 30 code examples of keras.backend.softmax(). You can go to the original project or source file by following the source reference above each example. You may also want to check out all other available functions and classes of the keras.backend module.
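Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of calling keras.backend.softmax() directly; in recent Keras versions the normalization is applied over the last axis by default.

import numpy as np
from keras import backend as K

logits = K.constant(np.array([[1.0, 2.0, 3.0],
                              [1.0, 1.0, 1.0]]))
probs = K.softmax(logits)           # normalizes each row into a probability distribution
print(K.eval(probs).sum(axis=-1))   # each row sums to ~1.0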
Example #1
Source File: capsule.py From Keras-TextClassification with MIT License | 6 votes |
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    outputs = None
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
Example #2
Source File: augmented_model.py From tying-wv-and-wc with MIT License | 6 votes |
def augmented_loss(self, y_true, y_pred):
    _y_pred = Activation("softmax")(y_pred)
    loss = K.categorical_crossentropy(_y_pred, y_true)

    # y is (batch x seq x vocab)
    y_indexes = K.argmax(y_true, axis=2)   # turn one hot to index. (batch x seq)
    y_vectors = self.embedding(y_indexes)  # lookup the vector (batch x seq x vector_length)

    #v_length = self.setting.vector_length
    #y_vectors = K.reshape(y_vectors, (-1, v_length))
    #y_t = K.map_fn(lambda v: K.dot(self.embedding.embeddings, K.reshape(v, (-1, 1))), y_vectors)
    #y_t = K.squeeze(y_t, axis=2)  # unknown but necessary operation
    #y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))

    # vector x embedding dot products (batch x seq x vocab)
    y_t = tf.tensordot(y_vectors, K.transpose(self.embedding.embeddings), 1)
    y_t = K.reshape(y_t, (-1, self.sequence_size, self.vocab_size))  # explicitly set shape
    y_t = K.softmax(y_t / self.temperature)
    _y_pred_t = Activation("softmax")(y_pred / self.temperature)

    aug_loss = kullback_leibler_divergence(y_t, _y_pred_t)
    loss += (self.gamma * self.temperature) * aug_loss
    return loss
Example #3
Source File: augmented_model.py From tying-wv-and-wc with MIT License | 6 votes |
def __init__(self, vocab_size, sequence_size, setting=None, checkpoint_path="", temperature=10, tying=False):
    super().__init__(vocab_size, sequence_size, setting, checkpoint_path)
    self.temperature = temperature
    self.tying = tying
    self.gamma = self.setting.gamma

    if tying:
        self.model.pop()  # remove activation
        self.model.pop()  # remove projection (use self embedding)
        self.model.add(Lambda(lambda x: K.dot(x, K.transpose(self.embedding.embeddings))))
        self.model.add(Activation("softmax"))
Example #4
Source File: attention.py From keras-utility-layer-collection with MIT License | 6 votes |
def step(self, x, states):
    h = states[0]
    # states[1] necessary?

    # comes from the constants
    X_static = states[-2]
    # equals K.dot(static_x, self._W1) + self._b2 with X.shape=[bs, L, static_input_dim]
    total_x_static_prod = states[-1]

    # expand dims to add the vector which is only valid for this time step
    # to total_x_prod which is valid for all time steps
    hw = K.expand_dims(K.dot(h, self._W2), 1)
    additive_atn = total_x_static_prod + hw
    attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
    static_x_weighted = K.sum(attention * X_static, [1])

    x = K.dot(K.concatenate([x, static_x_weighted], 1), self._W3) + self._b3

    h, new_states = self.layer.cell.call(x, states[:-2])

    # append attention to the states to "smuggle" it out of the RNN wrapper
    attention = K.squeeze(attention, -1)
    h = K.concatenate([h, attention])

    return h, new_states
Example #5
Source File: attention.py From keras-utility-layer-collection with MIT License | 6 votes |
def step(self, x, states):
    h = states[0]
    # states[1] necessary?

    # equals K.dot(X, self._W1) + self._b2 with X.shape=[bs, T, input_dim]
    total_x_prod = states[-1]
    # comes from the constants (equals the input sequence)
    X = states[-2]

    # expand dims to add the vector which is only valid for this time step
    # to total_x_prod which is valid for all time steps
    hw = K.expand_dims(K.dot(h, self._W2), 1)
    additive_atn = total_x_prod + hw
    attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
    x_weighted = K.sum(attention * X, [1])

    x = K.dot(K.concatenate([x, x_weighted], 1), self._W3) + self._b3

    h, new_states = self.layer.cell.call(x, states[:-2])

    return h, new_states
Example #6
Source File: self_attention.py From nlp_toolkit with MIT License | 6 votes |
def Mask(self, inputs, seq_len, mode='mul'):
    """
    # Arguments:
        inputs: input tensor with shape (batch_size, seq_len, input_size)
        seq_len: each sequence's actual length, with shape (batch_size,)
        mode:
            mul: mask the rest with zero, used before a fully-connected layer
            add: subtract a big constant from the rest, used before a softmax layer
    # Returns:
        Masked tensor with the same shape as the input tensor
    """
    if seq_len is None:
        return inputs
    else:
        mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
        mask = 1 - K.cumsum(mask, 1)
        for _ in range(len(inputs.shape) - 2):
            mask = K.expand_dims(mask, 2)
        if mode == 'mul':
            return inputs * mask
        if mode == 'add':
            return inputs - (1 - mask) * 1e12
Example #7
Source File: multi_dim_attention.py From nlp_toolkit with MIT License | 6 votes |
def call(self, x, mask=None):
    uit = K.tanh(K.dot(x, self.Ws1))
    ait = K.dot(uit, self.Ws2)
    ait = K.permute_dimensions(ait, (0, 2, 1))
    A = K.softmax(ait, axis=1)
    M = K.batch_dot(A, x)
    if self.punish:
        A_T = K.permute_dimensions(A, (0, 2, 1))
        tile_eye = K.tile(K.eye(self.weight_ws2), [self.batch_size, 1])
        tile_eye = K.reshape(
            tile_eye, shape=[-1, self.weight_ws2, self.weight_ws2])
        AA_T = K.batch_dot(A, A_T) - tile_eye
        P = K.l2_normalize(AA_T, axis=(1, 2))
        return M, P
    else:
        return M
Example #8
Source File: neural_networks.py From Quora with MIT License | 6 votes |
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))  # noqa
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]  # noqa

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]  # noqa
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]  # noqa
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))  # noqa
        if i < self.routings - 1:
            b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
Example #9
Source File: submission_v50.py From Quora with MIT License | 6 votes |
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))  # noqa
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]  # noqa

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]  # noqa
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]  # noqa
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))  # noqa
        if i < self.routings - 1:
            b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
Example #10
Source File: memory.py From NTM-Keras with MIT License | 6 votes |
def content_addressing(memory_t, key_vector_t, key_strength_t):
    '''
    Focusing by content.
    :param memory_t: external memory.
    :param key_vector_t: key vector.
    :param key_strength_t: the strength of key.
    :return:
    '''
    # print("content addressing:")
    # print(">>memory_t")
    # print(key_vector_t)
    # print(">>key_vector_t")
    # print(key_vector_t)
    # print(">>key_strength_t")
    # print(key_strength_t)
    _weight_content_t = \
        key_strength_t * cosine_similarity_group(key_vector_t, memory_t)
    weight_content_t = softmax(_weight_content_t)
    # print("_weight_content_t")
    # print(_weight_content_t)
    return weight_content_t
Example #11
Source File: ntm.py From ntm_keras with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _get_weight_vector(self, M, w_tm1, k, beta, g, s, gamma):
    # M = tf.Print(M, [M, w_tm1, k], message='get weights beg1: ')
    # M = tf.Print(M, [beta, g, s, gamma], message='get weights beg2: ')

    # Content addressing, see Chapter 3.3.1:
    num = beta * _cosine_distance(M, k)
    w_c = K.softmax(num)  # It turns out that equation (5) is just softmax.

    # Location addressing, see Chapter 3.3.2:
    # Equation 7:
    w_g = (g * w_c) + (1 - g) * w_tm1
    # C_s is the circular convolution
    #C_w = K.sum((self.C[None, :, :, :] * w_g[:, None, None, :]), axis=3)
    # Equation 8:
    # TODO: Explain
    C_s = K.sum(K.repeat_elements(self.C[None, :, :, :], self.batch_size, axis=0) * s[:, :, None, None], axis=1)
    w_tilda = K.batch_dot(C_s, w_g)
    # Equation 9:
    w_out = _renorm(w_tilda ** gamma)

    return w_out
Example #12
Source File: temporal_mean_rate_theano.py From snn_toolbox with MIT License | 6 votes |
def softmax_activation(self, mem):
    """Softmax activation."""

    # spiking_samples = k.less_equal(k.random_uniform([self.config.getint(
    #     'simulation', 'batch_size'), 1]), 300 * self.dt / 1000.)
    # spiking_neurons = k.T.repeat(spiking_samples, 10, axis=1)
    # activ = k.T.nnet.softmax(mem)
    # max_activ = k.max(activ, axis=1, keepdims=True)
    # output_spikes = k.equal(activ, max_activ).astype(k.floatx())
    # output_spikes = k.T.set_subtensor(output_spikes[k.equal(
    #     spiking_neurons, 0).nonzero()], 0.)
    # new_and_reset_mem = k.T.set_subtensor(mem[spiking_neurons.nonzero()], 0.)
    # self.add_update([(self.mem, new_and_reset_mem)])
    # return output_spikes

    return k.T.mul(k.less_equal(k.random_uniform(mem.shape),
                                k.softmax(mem)), self.v_thresh)
Example #13
Source File: temporal_mean_rate_theano.py From snn_toolbox with MIT License | 6 votes |
def set_reset_mem(self, mem, spikes):
    """
    Reset membrane potential ``mem`` array where ``spikes`` array is nonzero.
    """

    spike_idxs = k.T.nonzero(spikes)
    if (hasattr(self, 'activation_str') and
            self.activation_str == 'softmax'):
        new = mem.copy()  # k.T.set_subtensor(mem[spike_idxs], 0.)
    elif self.config.get('cell', 'reset') == 'Reset by subtraction':
        if self.payloads:  # Experimental.
            new = k.T.set_subtensor(mem[spike_idxs], 0.)
        else:
            pos_spike_idxs = k.T.nonzero(k.greater(spikes, 0))
            neg_spike_idxs = k.T.nonzero(k.less(spikes, 0))
            new = k.T.inc_subtensor(mem[pos_spike_idxs], -self.v_thresh)
            new = k.T.inc_subtensor(new[neg_spike_idxs], self.v_thresh)
    elif self.config.get('cell', 'reset') == 'Reset by modulo':
        new = k.T.set_subtensor(mem[spike_idxs],
                                mem[spike_idxs] % self.v_thresh)
    else:  # self.config.get('cell', 'reset') == 'Reset to zero':
        new = k.T.set_subtensor(mem[spike_idxs], 0.)
    self.add_update([(self.mem, new)])
Example #14
Source File: models.py From voxelmorph with GNU General Public License v3.0 | 6 votes |
def _softmax(x, axis=-1, alpha=1):
    """
    Building on the Keras implementation, allow an alpha parameter.

    Softmax activation function.
    # Arguments
        x: Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
        alpha: a value to multiply all x
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    x = alpha * x
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
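A small usage sketch of the helper above (illustrative shapes only, assuming _softmax is importable from the module): alpha acts as an inverse temperature, so values below 1 flatten the output distribution and values above 1 sharpen it.

import numpy as np
from keras import backend as K

logits = K.constant(np.random.randn(2, 4, 5))         # e.g. (batch, locations, labels)
flat = K.eval(_softmax(logits, axis=-1, alpha=0.5))   # flatter label probabilities
sharp = K.eval(_softmax(logits, axis=-1, alpha=2.0))  # more peaked label probabilities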
Example #15
Source File: models.py From DigiX_HuaWei_Population_Age_Attribution_Predict with MIT License | 6 votes |
def mlp_v2():
    model = Sequential()
    model.add(Dense(2048, input_shape=(21099,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())

    # model.add(Dense(1024))
    # model.add(Activation('relu'))
    # model.add(Dropout(0.5))
    # model.add(BatchNormalization())

    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())

    model.add(Dense(128))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())

    model.add(Dense(6))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    return model
Example #16
Source File: models.py From DigiX_HuaWei_Population_Age_Attribution_Predict with MIT License | 6 votes |
def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs
Example #17
Source File: models.py From DigiX_HuaWei_Population_Age_Attribution_Predict with MIT License | 6 votes |
def CapsuleNet(n_capsule=10, n_routings=5, capsule_dim=16,
               n_recurrent=100, dropout_rate=0.2, l2_penalty=0.0001):
    K.clear_session()

    inputs = Input(shape=(170,))
    x = Embedding(21099, 300, trainable=True)(inputs)
    x = SpatialDropout1D(dropout_rate)(x)
    x = Bidirectional(
        CuDNNGRU(n_recurrent, return_sequences=True,
                 kernel_regularizer=l2(l2_penalty),
                 recurrent_regularizer=l2(l2_penalty)))(x)
    x = PReLU()(x)
    x = Capsule(
        num_capsule=n_capsule, dim_capsule=capsule_dim,
        routings=n_routings, share_weights=True)(x)
    x = Flatten(name='concatenate')(x)
    x = Dropout(dropout_rate)(x)
    # fc = Dense(128, activation='sigmoid')(x)
    outputs = Dense(6, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='categorical_crossentropy',
                  optimizer='nadam',
                  metrics=['accuracy'])
    return model
Example #18
Source File: seq2seq_attention.py From stock-price-predict with MIT License | 6 votes |
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x: Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
Example #19
Source File: seq2seq_attention_2.py From stock-price-predict with MIT License | 6 votes |
def softmax(x, axis=1):
    """Softmax activation function.
    # Arguments
        x: Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `dim(x) == 1`.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
Example #20
Source File: attention.py From keras-transformer with MIT License | 6 votes |
def mask_attention_if_needed(self, dot_product):
    """
    Makes sure that (when enabled) each position
    (of a decoder's self-attention) cannot attend to subsequent positions.
    This is achieved by assigning -inf (or some large negative number)
    to all invalid connections. Later softmax will turn them into zeros.
    We need this to guarantee that decoder's predictions are based
    on what has happened before the position, not after.
    The method does nothing if masking is turned off.
    :param dot_product: scaled dot-product of Q and K after reshaping them
        to 3D tensors (batch * num_heads, rows, cols)
    """
    if not self.use_masking:
        return dot_product
    last_dims = K.int_shape(dot_product)[-2:]
    low_triangle_ones = (
        np.tril(np.ones(last_dims))
        # to ensure proper broadcasting
        .reshape((1,) + last_dims))
    inverse_low_triangle = 1 - low_triangle_ones
    close_to_negative_inf = -1e9
    result = (
        K.constant(low_triangle_ones, dtype=K.floatx()) * dot_product +
        K.constant(close_to_negative_inf * inverse_low_triangle))
    return result
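As a quick standalone illustration of the triangular masking described in the docstring above (plain NumPy, not part of the keras-transformer source):

import numpy as np

L = 4                                         # sequence length
low_triangle_ones = np.tril(np.ones((L, L)))  # 1 on and below the diagonal
inverse_low_triangle = 1 - low_triangle_ones  # 1 only at "future" positions
scores = np.random.randn(L, L)                # stand-in for the scaled dot-product
masked = scores * low_triangle_ones - 1e9 * inverse_low_triangle
# row i of softmax(masked) now assigns (near-)zero weight to positions j > i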
Example #21
Source File: learn_labelembedding.py From semantic-embeddings with MIT License | 6 votes |
def labelembed_loss(out1, out2, tar, targets, tau=2., alpha=0.9, beta=0.5, num_classes=100):
    out2_prob = K.softmax(out2)
    tau2_prob = K.stop_gradient(K.softmax(out2 / tau))
    soft_tar = K.stop_gradient(K.softmax(tar))

    L_o1_y = K.sparse_categorical_crossentropy(output=K.softmax(out1), target=targets)

    pred = K.argmax(out2, axis=-1)
    mask = K.stop_gradient(K.cast(K.equal(pred, K.cast(targets, 'int64')), K.floatx()))
    L_o1_emb = -cross_entropy(out1, soft_tar)  # pylint: disable=invalid-unary-operand-type

    L_o2_y = K.sparse_categorical_crossentropy(output=out2_prob, target=targets)
    L_emb_o2 = -cross_entropy(tar, tau2_prob) * mask * \
        (K.cast(K.shape(mask)[0], K.floatx()) / (K.sum(mask) + 1e-8))  # pylint: disable=invalid-unary-operand-type

    L_re = K.relu(K.sum(out2_prob * K.one_hot(K.cast(targets, 'int64'), num_classes), axis=-1) - alpha)

    return beta * L_o1_y + (1 - beta) * L_o1_emb + L_o2_y + L_emb_o2 + L_re
Example #22
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis.
Example #23
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis.
Example #24
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def call(self, inputs):
    """Following the routing algorithm from Hinton's paper,
    but replace b = b + <u,v> with b = <u,v>.

    This change can improve the feature representation of Capsule.

    However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to realize a standard routing.
    """

    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    for i in range(self.routings):
        c = softmax(b, 1)
        if K.backend() == 'theano':
            o = K.sum(o, axis=1)
        o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o
Example #25
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis.
Example #26
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


# define the margin loss like hinge loss
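A minimal check (illustrative shapes only, assuming the softmax function defined above is in scope) that this version normalizes over the requested axis, which is what the capsule routing in Example #24 relies on:

import numpy as np
from keras import backend as K

b = K.constant(np.random.randn(2, 10, 32))  # (batch, num_capsule, input_num_capsule)
c = softmax(b, axis=1)                      # coupling coefficients normalized over axis 1
print(K.eval(K.sum(c, axis=1)))             # every entry is ~1.0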
Example #27
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis.
Example #28
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


# define the margin loss like hinge loss
Example #29
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)


# define the margin loss like hinge loss
Example #30
Source File: cifar10_cnn_capsule.py From DeepLearning_Wavelet-LSTM with MIT License | 5 votes |
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x


# define our own softmax function instead of K.softmax
# because K.softmax can not specify axis.