Python allennlp.nn.util.sequence_cross_entropy_with_logits() Examples
The following are 21 code examples of allennlp.nn.util.sequence_cross_entropy_with_logits(), collected from open-source projects.
The source file, project, and license are noted above each example.
You may also want to check out the other available functions and classes of the module
allennlp.nn.util.
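For orientation, here is a minimal sketch of the call as it is exercised throughout the examples below; the tensor names and sizes are illustrative, not taken from any one project:

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, seq_len, num_classes = 2, 5, 4
logits = torch.randn(batch_size, seq_len, num_classes)           # unnormalized scores per token
targets = torch.randint(0, num_classes, (batch_size, seq_len))   # gold class index per token
weights = torch.ones(batch_size, seq_len)                        # 1.0 for real tokens, 0.0 for padding

# Default behaviour: a scalar loss, averaged per sequence and then over the batch.
loss = sequence_cross_entropy_with_logits(logits, targets, weights)

# Per-sequence losses instead of a single scalar (see Examples #2, #14 and #21).
per_sequence_loss = sequence_cross_entropy_with_logits(logits, targets, weights, average=None)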
Example #1
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_smooths_labels_correctly(self):
    tensor = torch.rand([1, 3, 4])
    targets = torch.LongTensor(numpy.random.randint(0, 3, [1, 3]))
    weights = torch.ones([2, 3])
    loss = util.sequence_cross_entropy_with_logits(
        tensor, targets, weights, label_smoothing=0.1
    )
    correct_loss = 0.0
    for prediction, label in zip(tensor.squeeze(0), targets.squeeze(0)):
        prediction = torch.nn.functional.log_softmax(prediction, dim=-1)
        correct_loss += prediction[label] * 0.9
        # incorrect elements
        correct_loss += prediction.sum() * 0.1 / 4
    # Average over sequence.
    correct_loss = -correct_loss / 3
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
Example #2
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_averages_batch_correctly(self):
    # test batch average is the same as dividing the batch averaged
    # loss by the number of batches containing any non-padded tokens.
    tensor = torch.rand([5, 7, 4])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, :, :] = 0
    weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights)
    vector_loss = util.sequence_cross_entropy_with_logits(
        tensor, targets, weights, average=None
    )
    # Batch has one completely padded row, so divide by 4.
    assert loss.data.numpy() == vector_loss.sum().item() / 4
Example #3
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_averages_token_correctly(self):
    # test token average is the same as multiplying the per-batch loss
    # with the per-batch weights and dividing by the total weight
    tensor = torch.rand([5, 7, 4])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, :, :] = 0
    weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, average="token")
    vector_loss = util.sequence_cross_entropy_with_logits(
        tensor, targets, weights, average=None
    )
    total_token_loss = (vector_loss * weights.float().sum(dim=-1)).sum()
    average_token_loss = (total_token_loss / weights.float().sum()).detach()
    assert_almost_equal(loss.detach().item(), average_token_loss.item(), decimal=5)
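Read together with Example #2, this test pins down the three averaging modes; the summary below paraphrases what the two tests assert:

average="batch" (the default)  ->  scalar: mean of the per-sequence average losses, taken over sequences that contain at least one non-padded token
average="token"                ->  scalar: sum of all per-token losses divided by the total weight
average=None                   ->  tensor of shape (batch_size,): the per-sequence average losses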
Example #4
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_gamma_correctly(self):
    batch = 1
    length = 3
    classes = 4
    gamma = abs(numpy.random.randn())  # [0, +inf)

    tensor = torch.rand([batch, length, classes])
    targets = torch.LongTensor(numpy.random.randint(0, classes, [batch, length]))
    weights = torch.ones([batch, length])

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, gamma=gamma)

    correct_loss = 0.0
    for logit, label in zip(tensor.squeeze(0), targets.squeeze(0)):
        p = torch.nn.functional.softmax(logit, dim=-1)
        pt = p[label]
        ft = (1 - pt) ** gamma
        correct_loss += -pt.log() * ft
    # Average over sequence.
    correct_loss = correct_loss / length
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
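In other words, passing gamma applies a focal-loss style modulating factor: the loop above accumulates, for each token,

loss_t = -((1 - p_t) ** gamma) * log(p_t)

where p_t is the softmax probability assigned to the gold label, so confidently classified tokens are down-weighted before the loss is averaged over the sequence.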
Example #5
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_alpha_list_correctly(self):
    batch = 1
    length = 3
    classes = 4  # alpha float for binary class only
    alpha = abs(numpy.random.randn(classes))  # [0, +inf)

    tensor = torch.rand([batch, length, classes])
    targets = torch.LongTensor(numpy.random.randint(0, classes, [batch, length]))
    weights = torch.ones([batch, length])

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, alpha=alpha)

    correct_loss = 0.0
    for logit, label in zip(tensor.squeeze(0), targets.squeeze(0)):
        logp = torch.nn.functional.log_softmax(logit, dim=-1)
        logpt = logp[label]
        at = alpha[label]
        correct_loss += -logpt * at
    # Average over sequence.
    correct_loss = correct_loss / length
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
Example #6
Source File: tag_decoder.py From udify with MIT License | 6 votes |
def _loss(self, hidden, mask, gold_tags, output_dim):
    logits = self.task_output(hidden)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(output_dim)

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if gold_tags is not None:
        output_dict["loss"] = sequence_cross_entropy_with_logits(logits,
                                                                 gold_tags,
                                                                 mask,
                                                                 label_smoothing=self.label_smoothing)
        for metric in self.metrics.values():
            metric(logits, gold_tags, mask.float())

    return output_dict
Example #7
Source File: lstm.py From allennlp_tutorial with MIT License | 6 votes |
def forward(self,
            tokens: Dict[str, torch.Tensor],
            label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
    mask = get_text_field_mask(tokens)

    embedded = self._embedder(tokens)
    encoded = self._encoder(embedded, mask)
    classified = self._classifier(encoded)

    output: Dict[str, torch.Tensor] = {}
    output['logits'] = classified

    if label is not None:
        self._f1(classified, label, mask)
        output['loss'] = sequence_cross_entropy_with_logits(classified, label, mask)

    return output
Example #8
Source File: util_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_sequence_cross_entropy_with_logits_masks_loss_correctly(self):
    # test weight masking by checking that a tensor with non-zero values in
    # masked positions returns the same loss as a tensor with zeros in those
    # positions.
    tensor = torch.rand([5, 7, 4])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, :, :] = 0
    weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
    tensor2 = tensor.clone()
    tensor2[0, 3:, :] = 2
    tensor2[1, 4:, :] = 13
    tensor2[2, 2:, :] = 234
    tensor2[3, :, :] = 65
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights)
    loss2 = util.sequence_cross_entropy_with_logits(tensor2, targets, weights)
    assert loss.data.numpy() == loss2.data.numpy()
Example #9
Source File: util_test.py From magnitude with MIT License | 6 votes |
def test_sequence_cross_entropy_with_logits_masks_loss_correctly(self):
    # test weight masking by checking that a tensor with non-zero values in
    # masked positions returns the same loss as a tensor with zeros in those
    # positions.
    tensor = torch.rand([5, 7, 4])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, :, :] = 0
    weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
    tensor2 = tensor.clone()
    tensor2[0, 3:, :] = 2
    tensor2[1, 4:, :] = 13
    tensor2[2, 2:, :] = 234
    tensor2[3, :, :] = 65
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights)
    loss2 = util.sequence_cross_entropy_with_logits(tensor2, targets, weights)
    assert loss.data.numpy() == loss2.data.numpy()
Example #10
Source File: util_test.py From magnitude with MIT License | 6 votes |
def test_sequence_cross_entropy_with_logits_smooths_labels_correctly(self):
    tensor = torch.rand([1, 3, 4])
    targets = torch.LongTensor(numpy.random.randint(0, 3, [1, 3]))
    weights = torch.ones([2, 3])
    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights,
                                                   label_smoothing=0.1)
    correct_loss = 0.0
    for prediction, label in izip(tensor.squeeze(0), targets.squeeze(0)):
        prediction = torch.nn.functional.log_softmax(prediction, dim=-1)
        correct_loss += prediction[label] * 0.9
        # incorrect elements
        correct_loss += prediction.sum() * 0.1 / 4
    # Average over sequence.
    correct_loss = -correct_loss / 3
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
Example #11
Source File: simple_seq2seq.py From magnitude with MIT License | 5 votes |
def _get_loss(logits, targets, target_mask):
    u"""
    Takes logits (unnormalized outputs from the decoder) of size (batch_size,
    num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps+1)
    and corresponding masks of size (batch_size, num_decoding_steps+1) steps and computes cross
    entropy loss while taking the mask into account.

    The length of ``targets`` is expected to be greater than that of ``logits`` because the
    decoder does not need to compute the output corresponding to the last timestep of
    ``targets``. This method aligns the inputs appropriately to compute the loss.

    During training, we want the logit corresponding to timestep i to be similar to the target
    token from timestep i + 1. That is, the targets should be shifted by one timestep for
    appropriate comparison. Consider a single example where the target has 3 words, and
    padding is to 7 tokens.

       The complete sequence would correspond to <S> w1 w2 w3 <E> <P> <P>
       and the mask would be                      1  1  1  1   1   0   0
       and let the logits be                      l1 l2 l3 l4  l5  l6

    We actually need to compare:

       the sequence           w1 w2 w3 <E> <P> <P>
       with masks              1  1  1   1   0   0
       against                l1 l2 l3  l4  l5  l6
       (where the input was)  <S> w1 w2  w3 <E> <P>
    """
    relevant_targets = targets[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
    relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
    loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
    return loss

#overrides
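The shift described in the docstring can be checked directly on a toy target row; the token ids below are made up purely for illustration:

import torch

# <S>=1, w1=5, w2=6, w3=7, <E>=2, <P>=0, matching the picture in the docstring
targets = torch.tensor([[1, 5, 6, 7, 2, 0, 0]])        # (batch_size, num_decoding_steps + 1)
target_mask = torch.tensor([[1, 1, 1, 1, 1, 0, 0]])

relevant_targets = targets[:, 1:]      # [[5, 6, 7, 2, 0, 0]] -> w1 w2 w3 <E> <P> <P>
relevant_mask = target_mask[:, 1:]     # [[1, 1, 1, 1, 0, 0]]
# these now line up, position for position, with the six logits l1 ... l6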
Example #12
Source File: custom_autoregressive_seq2seq_decoder.py From summarus with Apache License 2.0 | 5 votes |
def _get_loss(self,
              logits: torch.LongTensor,
              targets: torch.LongTensor,
              target_mask: torch.LongTensor) -> torch.Tensor:
    # shape: (batch_size, num_decoding_steps)
    relevant_targets = targets[:, 1:].contiguous()

    # shape: (batch_size, num_decoding_steps)
    relevant_mask = target_mask[:, 1:].contiguous()

    return util.sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask,
                                                   label_smoothing=self._label_smoothing_ratio)
Example #13
Source File: lstm_character.py From allennlp_tutorial with MIT License | 5 votes |
def forward(self,
            tokens: Dict[str, torch.Tensor],
            label: torch.Tensor) -> Dict[str, torch.Tensor]:
    # split the namespace into characters and tokens, since they
    # aren't the same shape
    characters = { 'characters': tokens['characters'] }
    tokens = { 'tokens': tokens['tokens'] }

    # get the tokens mask
    mask = get_text_field_mask(tokens)
    # get the characters mask, for which we use the nifty `num_wrapping_dims` argument
    character_mask = get_text_field_mask(characters, num_wrapping_dims=1)

    # decompose the shape into named parameters for future use
    batch_size, sequence_length, word_length = character_mask.shape

    # embed the characters
    embedded_characters = self._character_embedder(characters)

    # convert the embeddings from 4d embeddings to a 3d tensor
    # the first dimension of this tensor is (batch_size * num_tokens)
    # (i.e. each word is its own instance in a batch)
    embedded_characters = embedded_characters.view(batch_size*sequence_length, word_length, -1)
    character_mask = character_mask.view(batch_size*sequence_length, word_length)

    # run the character LSTM
    encoded_characters = self._character_encoder(embedded_characters, character_mask)

    # reshape the output into a 3d tensor we can concatenate with the word embeddings
    encoded_characters = encoded_characters.view(batch_size, sequence_length, -1)

    # run the standard LSTM NER pipeline
    embedded = self._word_embedder(tokens)
    embedded = torch.cat([embedded, encoded_characters], dim=2)
    encoded = self._encoder(embedded, mask)
    classified = self._classifier(encoded)

    # collect the logits; the snippet uses `output` below, so it is initialized here
    # in the same way as the plain LSTM example from this tutorial (Example #7)
    output: Dict[str, torch.Tensor] = {}
    output['logits'] = classified

    if label is not None:
        self._f1(classified, label, mask)
        output["loss"] = sequence_cross_entropy_with_logits(classified, label, mask)

    return output
Example #14
Source File: updown_captioner.py From updown-baseline with MIT License | 5 votes |
def _get_loss(
    self, logits: torch.Tensor, targets: torch.Tensor, target_mask: torch.Tensor
) -> torch.Tensor:
    r"""
    Compute cross entropy loss of predicted caption (logits) w.r.t. target caption. The cross
    entropy loss of caption is cross entropy loss at each time-step, summed.

    Parameters
    ----------
    logits: torch.Tensor
        A tensor of shape ``(batch_size, max_caption_length - 1, vocab_size)`` containing
        unnormalized log-probabilities of predicted captions.
    targets: torch.Tensor
        A tensor of shape ``(batch_size, max_caption_length - 1)`` of tokenized target captions.
    target_mask: torch.Tensor
        A mask over target captions, elements where mask is zero are ignored from loss
        computation. Here, we ignore ``@@UNKNOWN@@`` token (and hence padding tokens too
        because they are basically the same).

    Returns
    -------
    torch.Tensor
        A tensor of shape ``(batch_size, )`` containing cross entropy loss of captions, summed
        across time-steps.
    """
    # shape: (batch_size, )
    target_lengths = torch.sum(target_mask, dim=-1).float()

    # shape: (batch_size, )
    return target_lengths * sequence_cross_entropy_with_logits(
        logits, targets, target_mask, average=None
    )
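Because average=None returns, for each caption, the cross entropy averaged over its non-masked time-steps, multiplying by the number of non-masked time-steps recovers the summed loss the docstring promises. A minimal sketch of that identity, with made-up sizes:

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, max_len, vocab_size = 3, 7, 11
logits = torch.randn(batch_size, max_len, vocab_size)
targets = torch.randint(0, vocab_size, (batch_size, max_len))
target_mask = torch.ones(batch_size, max_len)
target_mask[:, 5:] = 0.0                       # pretend the last two positions are padding

target_lengths = target_mask.sum(dim=-1)       # non-masked steps per caption, shape (batch_size,)
per_step_average = sequence_cross_entropy_with_logits(logits, targets, target_mask, average=None)
summed_loss = target_lengths * per_step_average   # cross entropy summed over time-steps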
Example #15
Source File: simple_seq2seq_test.py From magnitude with MIT License | 5 votes |
def test_loss_is_computed_correctly(self):
    batch_size = 5
    num_decoding_steps = 5
    num_classes = 10
    sample_logits = torch.randn(batch_size, num_decoding_steps - 1, num_classes)
    sample_targets = torch.from_numpy(numpy.random.randint(0, num_classes,
                                                           (batch_size, num_decoding_steps)))
    # Mask should be either 0 or 1
    sample_mask = torch.from_numpy(numpy.random.randint(0, 2,
                                                        (batch_size, num_decoding_steps)))
    expected_loss = sequence_cross_entropy_with_logits(sample_logits,
                                                       sample_targets[:, 1:].contiguous(),
                                                       sample_mask[:, 1:].contiguous())
    # pylint: disable=protected-access
    actual_loss = self.model._get_loss(sample_logits, sample_targets, sample_mask)
    assert numpy.equal(expected_loss.data.numpy(), actual_loss.data.numpy())
Example #16
Source File: tag_decoder.py From udify with MIT License | 5 votes |
def _features_loss(self, hidden, mask, gold_tags, output_dict):
    if gold_tags is None:
        return

    for feature in self.features:
        logits = self.feature_outputs[feature](hidden)
        loss = sequence_cross_entropy_with_logits(logits,
                                                  gold_tags[feature],
                                                  mask,
                                                  label_smoothing=self.label_smoothing)
        loss /= len(self.features)
        output_dict["loss"] += loss

        for metric in self.features_metrics[feature].values():
            metric(logits, gold_tags[feature], mask.float())
Example #17
Source File: util_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_sequence_cross_entropy_with_logits_alpha_single_float_correctly(self):
    batch = 1
    length = 3
    classes = 2  # alpha float for binary class only
    alpha = (
        numpy.random.rand() if numpy.random.rand() > 0.5 else (1.0 - numpy.random.rand())
    )  # [0, 1]
    alpha = torch.tensor(alpha)

    tensor = torch.rand([batch, length, classes])
    targets = torch.LongTensor(numpy.random.randint(0, classes, [batch, length]))
    weights = torch.ones([batch, length])

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, alpha=alpha)

    correct_loss = 0.0
    for logit, label in zip(tensor.squeeze(0), targets.squeeze(0)):
        logp = torch.nn.functional.log_softmax(logit, dim=-1)
        logpt = logp[label]
        if label:
            at = alpha
        else:
            at = 1 - alpha
        correct_loss += -logpt * at
    # Average over sequence.
    correct_loss = correct_loss / length
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
Example #18
Source File: simple_tagger.py From magnitude with MIT License | 4 votes |
def forward(self,  # type: ignore
            tokens,
            tags=None,
            metadata=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels of shape
        ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence to be tagged under a 'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = embedded_text_input.size()
    mask = get_text_field_mask(tokens)
    encoded_text = self.encoder(embedded_text_input, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                      sequence_length,
                                                                      self.num_classes])

    output_dict = {u"logits": logits, u"class_probabilities": class_probabilities}

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, mask)
        for metric in list(self.metrics.values()):
            metric(logits, tags, mask.float())
        output_dict[u"loss"] = loss

    if metadata is not None:
        output_dict[u"words"] = [x[u"words"] for x in metadata]
    return output_dict

#overrides
Example #19
Source File: simple_bilm.py From swagaf with MIT License | 4 votes |
def forward(self, words: torch.Tensor, use_forward=True, use_reverse=True,
            compute_logprobs=False) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
    """
    use this for training the LM
    :param words: [batch_size, N] words. assuming you're starting with BOS and ending with EOS here
    :return:
    """
    encoded_inputs = self.embed_words(words)
    mask = (words != 0).long()[:, 2:]

    word_targets = words[:, 1:-1].contiguous()
    result_dict = {
        'mask': mask,
        'word_targets': word_targets,
    }
    # TODO: try to reduce duplicate code here
    if use_forward:
        self.forward_lm.reset_states()
        forward_activation = self.forward_lm(encoded_inputs[:, :-2], mask)
        if compute_logprobs:
            # being memory efficient here is critical if the input tensors are large
            result_dict['forward_logprobs'] = self._chunked_logsoftmaxes(forward_activation,
                                                                         word_targets) * mask.float()
        else:
            result_dict['forward_logits'] = self.decoder(forward_activation)
            result_dict['forward_loss'] = sequence_cross_entropy_with_logits(result_dict['forward_logits'],
                                                                             word_targets,
                                                                             mask)
    if use_reverse:
        self.reverse_lm.reset_states()
        reverse_activation = self.reverse_lm(encoded_inputs[:, 2:], mask)
        if compute_logprobs:
            result_dict['reverse_logprobs'] = self._chunked_logsoftmaxes(reverse_activation,
                                                                         word_targets) * mask.float()
        else:
            result_dict['reverse_logits'] = self.decoder(reverse_activation)
            result_dict['reverse_loss'] = sequence_cross_entropy_with_logits(result_dict['reverse_logits'],
                                                                             word_targets,
                                                                             mask)
    return result_dict
Example #20
Source File: simple_tagger.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels of shape
        ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence to be tagged under a 'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = embedded_text_input.size()
    mask = get_text_field_mask(tokens)
    encoded_text = self.encoder(embedded_text_input, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                      sequence_length,
                                                                      self.num_classes])

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, mask)
        for metric in self.metrics.values():
            metric(logits, tags, mask.float())
        if self._f1_metric is not None:
            self._f1_metric(logits, tags, mask.float())
        output_dict["loss"] = loss

    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
    return output_dict
Example #21
Source File: seq2seq_base.py From probnmn-clevr with MIT License | 4 votes |
def _get_loss(
    logits: torch.LongTensor, targets: torch.LongTensor, target_mask: torch.LongTensor
):
    r"""
    Override AllenNLP Seq2Seq model's provided ``_get_loss`` method, which returns sequence
    cross entropy averaged over batch by default. Instead, provide sequence cross entropy of
    each sequence in a batch separately.

    Extended Summary
    ----------------
    From AllenNLP documentation:

    Compute loss.

    Takes logits (unnormalized outputs from the decoder) of size (batch_size,
    num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps+1)
    and corresponding masks of size (batch_size, num_decoding_steps+1) steps and computes cross
    entropy loss while taking the mask into account.

    The length of ``targets`` is expected to be greater than that of ``logits`` because the
    decoder does not need to compute the output corresponding to the last timestep of
    ``targets``. This method aligns the inputs appropriately to compute the loss.

    During training, we want the logit corresponding to timestep i to be similar to the target
    token from timestep i + 1. That is, the targets should be shifted by one timestep for
    appropriate comparison. Consider a single example where the target has 3 words, and
    padding is to 7 tokens::

       The complete sequence would correspond to <S> w1 w2 w3 <E> <P> <P>
       and the mask would be                      1  1  1  1   1   0   0
       and let the logits be                      l1 l2 l3 l4  l5  l6

    We actually need to compare::

       the sequence           w1 w2 w3 <E> <P> <P>
       with masks              1  1  1   1   0   0
       against                l1 l2 l3  l4  l5  l6
       (where the input was)  <S> w1 w2  w3 <E> <P>
    """
    # shape: (batch_size, num_decoding_steps)
    relevant_targets = targets[:, 1:].contiguous()

    # shape: (batch_size, num_decoding_steps)
    relevant_mask = target_mask[:, 1:].contiguous()

    return sequence_cross_entropy_with_logits(
        logits, relevant_targets, relevant_mask, average=None
    )