Python allennlp.training.metrics.SpanBasedF1Measure() Examples

The following are 12 code examples of allennlp.training.metrics.SpanBasedF1Measure(), taken from open-source projects. Each example lists the source file and project it comes from. You may also want to check out all available functions/classes of the module allennlp.training.metrics.
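Before the project examples, here is a minimal usage sketch (not taken from any of the projects below; the vocabulary contents and tensor shapes are illustrative) showing how the metric is constructed, fed a batch of predictions, and read out:

import torch
from allennlp.data import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure

# Build a small vocabulary with a BIO-encoded tag namespace (illustrative).
vocab = Vocabulary()
for tag in ["O", "B-ARG1", "I-ARG1", "B-ARG2", "I-ARG2"]:
    vocab.add_token_to_namespace(tag, namespace="tags")

metric = SpanBasedF1Measure(vocab, tag_namespace="tags", label_encoding="BIO")

gold_tags = ["O", "B-ARG1", "I-ARG1", "O", "B-ARG2", "I-ARG2"]
gold = torch.tensor([[vocab.get_token_index(t, "tags") for t in gold_tags]])
logits = torch.rand(1, 6, vocab.get_vocab_size("tags"))  # (batch, seq_len, num_tags)

metric(logits, gold)                    # accumulate span counts for this batch
scores = metric.get_metric(reset=True)  # e.g. scores["f1-measure-overall"]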
Example #1
Source File: lstm_crf.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )
        self._crf = ConditionalRandomField(
            vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
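The tutorial snippet stops at the constructor; the sketch below (illustrative names, not the tutorial's actual code) shows where the metric is usually updated, inside forward(), using the padding mask derived from the text field:

from allennlp.nn import util

# Illustrative sketch: feed the per-token class scores, gold label ids, and
# padding mask to the span metric during the forward pass.
def _update_span_f1(self, tokens, logits, labels):
    mask = util.get_text_field_mask(tokens)  # (batch, seq_len)
    if labels is not None:
        self._f1(logits, labels, mask)       # logits: (batch, seq_len, num_labels)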
Example #2
Source File: crf_tagger.py    From didyprog with MIT License
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self) 
Example #3
Source File: span_based_f1_measure_test.py    From allennlp with Apache License 2.0
def test_bmes_span_metrics_are_computed_correctly(self, device: str):
        # (bmes_tags) B:0, M:1, E:2, S:3.
        # [S, B, M, E, S]
        # [S, S, S, S, S]
        gold_indices = [[3, 0, 1, 2, 3], [3, 3, 3, 3, 3]]
        gold_tensor = torch.tensor(gold_indices, device=device)

        prediction_tensor = torch.rand([2, 5, 4], device=device)
        # [S, B, E, S, S]
        # TP: 2, FP: 2, FN: 1.
        prediction_tensor[0, 0, 3] = 1  # (True positive)
        prediction_tensor[0, 1, 0] = 1  # (False positive
        prediction_tensor[0, 2, 2] = 1  # *)
        prediction_tensor[0, 3, 3] = 1  # (False positive)
        prediction_tensor[0, 4, 3] = 1  # (True positive)
        # [B, E, S, B, E]
        # TP: 1, FP: 2, FN: 4.
        prediction_tensor[1, 0, 0] = 1  # (False positive
        prediction_tensor[1, 1, 2] = 1  # *)
        prediction_tensor[1, 2, 3] = 1  # (True positive)
        prediction_tensor[1, 3, 0] = 1  # (False positive
        prediction_tensor[1, 4, 2] = 1  # *)

        metric = SpanBasedF1Measure(self.vocab, "bmes_tags", label_encoding="BMES")
        metric(prediction_tensor, gold_tensor)

        # TP: 3, FP: 4, FN: 5.
        metric_dict = metric.get_metric()

        assert_allclose(metric_dict["recall-overall"], 0.375, rtol=0.001, atol=1e-03)
        assert_allclose(metric_dict["precision-overall"], 0.428, rtol=0.001, atol=1e-03)
        assert_allclose(metric_dict["f1-measure-overall"], 0.4, rtol=0.001, atol=1e-03) 
Example #4
Source File: span_based_f1_measure_test.py    From allennlp with Apache License 2.0
def test_span_f1_accepts_tags_to_spans_function_argument(self, device: str):
        def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None):
            return [("mock", (42, 42))]

        # Should be ignored.
        bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
        gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags]
        gold_tensor = torch.tensor([gold_indices], device=device)
        prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device)

        metric = SpanBasedF1Measure(
            self.vocab,
            "tags",
            label_encoding=None,
            tags_to_spans_function=mock_tags_to_spans_function,
        )

        metric(prediction_tensor, gold_tensor)
        metric_dict = metric.get_metric()

        assert_allclose(metric_dict["recall-overall"], 1.0)
        assert_allclose(metric_dict["precision-overall"], 1.0)
        assert_allclose(metric_dict["f1-measure-overall"], 1.0)

        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, label_encoding="INVALID")
        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, tags_to_spans_function=mock_tags_to_spans_function)
        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, label_encoding=None, tags_to_spans_function=None) 
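A custom tags_to_spans_function only has to accept a list of string tags (plus an optional classes_to_ignore argument) and return (label, (start, end)) tuples with inclusive indices, as the mock above does. A slightly more realistic sketch (the function name and implementation are illustrative, not part of allennlp):

from typing import List, Optional, Tuple

# Illustrative sketch of the expected contract: a naive BIO span extractor.
def naive_bio_tags_to_spans(
    tag_sequence: List[str],
    classes_to_ignore: Optional[List[str]] = None,
) -> List[Tuple[str, Tuple[int, int]]]:
    classes_to_ignore = classes_to_ignore or []
    spans = []
    for start, tag in enumerate(tag_sequence):
        if tag.startswith("B-") and tag[2:] not in classes_to_ignore:
            end = start
            while end + 1 < len(tag_sequence) and tag_sequence[end + 1] == "I-" + tag[2:]:
                end += 1
            spans.append((tag[2:], (start, end)))
    return spans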
Example #5
Source File: semantic_role_labeler.py    From magnitude with MIT License
def __init__(self, vocab,
                 text_field_embedder,
                 encoder,
                 binary_feature_dim,
                 embedding_dropout=0.0,
                 initializer=InitializerApplicator(),
                 regularizer=None,
                 label_smoothing=None):
        super(SemanticRoleLabeler, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(u"labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=u"labels", ignore_classes=[u"V"])

        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing

        check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                               encoder.get_input_dim(),
                               u"text embedding dim + verb indicator embedding dim",
                               u"encoder input dim")
        initializer(self) 
Example #6
Source File: prolocal_model.py    From propara with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2seq_encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator) -> None:
        super(ProLocalModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder

        self.attention_layer = \
            Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                             seq2seq_encoder.get_output_dim()), normalize=True)

        self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_types)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
        self.num_tags = self.vocab.get_vocab_size("state_change_tags")

        self.tag_projection_layer = TimeDistributed(Linear(self.seq2seq_encoder.get_output_dim() + 2,
                                                            self.num_tags))
        self._type_accuracy = CategoricalAccuracy()

        self.type_f1_metrics = {}
        self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
        for type_label in self.type_labels_vocab.values():
            self.type_f1_metrics["type_" + type_label] = F1Measure(self.vocab.get_token_index(type_label, "state_change_type_labels"))

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self) 
Example #7
Source File: lstm.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                           out_features=vocab.get_vocab_size('labels'))

        self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1') 
Example #8
Source File: simple_tagger.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 calculate_span_f1: bool = None,
                 label_encoding: Optional[str] = None,
                 label_namespace: str = "labels",
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_classes))

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (label_encoding serves the same purpose).
        if calculate_span_f1 and not label_encoding:
            raise ConfigurationError("calculate_span_f1 is True, but "
                                     "no label_encoding was specified.")
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        if calculate_span_f1 or label_encoding:
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)
        else:
            self._f1_metric = None

        initializer(self) 
Example #9
Source File: simple_tagger.py    From allennlp with Apache License 2.0
def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        calculate_span_f1: bool = None,
        label_encoding: Optional[str] = None,
        label_namespace: str = "labels",
        verbose_metrics: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes)
        )

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (label_encoding serves the same purpose).
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError(
                    "calculate_span_f1 is True, but no label_encoding was specified."
                )
            self._f1_metric = SpanBasedF1Measure(
                vocab, tag_namespace=label_namespace, label_encoding=label_encoding
            )
        else:
            self._f1_metric = None

        initializer(self) 
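This snippet also stops at the constructor; the sketch below (illustrative, not copied verbatim from the project) shows how such a tagger typically exposes its metrics, keeping only the aggregated "-overall" span scores unless verbose_metrics is set:

from typing import Dict

# Illustrative sketch: merge token-level accuracy with the span F1 scores.
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    metrics = {name: metric.get_metric(reset) for name, metric in self.metrics.items()}
    if self._f1_metric is not None:
        f1_dict = self._f1_metric.get_metric(reset=reset)
        if self._verbose_metrics:
            metrics.update(f1_dict)
        else:
            metrics.update({k: v for k, v in f1_dict.items() if "overall" in k})
    return metrics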
Example #10
Source File: crf_tagger.py    From magnitude with MIT License
def __init__(self, vocab,
                 text_field_embedder,
                 encoder,
                 label_namespace=u"labels",
                 constraint_type=None,
                 feedforward=None,
                 include_start_end_transitions=True,
                 dropout=None,
                 verbose_metrics=False,
                 initializer=InitializerApplicator(),
                 regularizer=None):
        super(CrfTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or u"BIO")


        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               u"text field embedding dim", u"encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   u"encoder output dim", u"feedforward input dim")
        initializer(self)

Example #11
Source File: span_based_f1_measure_test.py    From magnitude with MIT License
def test_span_metrics_are_computed_correctly(self):
        gold_labels = [u"O", u"B-ARG1", u"I-ARG1", u"O", u"B-ARG2", u"I-ARG2", u"O", u"O", u"O"]
        gold_indices = [self.vocab.get_token_index(x, u"tags") for x in gold_labels]

        gold_tensor = torch.Tensor([gold_indices, gold_indices])

        prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size(u"tags")])

        # Test that the span measure ignores completely masked sequences by
        # passing a mask with a fully masked row.
        mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                                 [0, 0, 0, 0, 0, 0, 0, 0, 0]])

        prediction_tensor[:, 0, 0] = 1
        prediction_tensor[:, 1, 1] = 1  # (True positive - ARG1
        prediction_tensor[:, 2, 2] = 1  # *)
        prediction_tensor[:, 3, 0] = 1
        prediction_tensor[:, 4, 0] = 1  # (False Negative - ARG2
        prediction_tensor[:, 5, 0] = 1  # *)
        prediction_tensor[:, 6, 0] = 1
        prediction_tensor[:, 7, 1] = 1  # (False Positive - ARG1
        prediction_tensor[:, 8, 2] = 1  # *)

        metric = SpanBasedF1Measure(self.vocab, u"tags")
        metric(prediction_tensor, gold_tensor, mask)

        assert metric._true_positives[u"ARG1"] == 1
        assert metric._true_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._true_positives.keys())
        assert metric._false_negatives[u"ARG1"] == 0
        assert metric._false_negatives[u"ARG2"] == 1
        assert u"O" not in list(metric._false_negatives.keys())
        assert metric._false_positives[u"ARG1"] == 1
        assert metric._false_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._false_positives.keys())

        # Check things are accumulating correctly.
        metric(prediction_tensor, gold_tensor, mask)
        assert metric._true_positives[u"ARG1"] == 2
        assert metric._true_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._true_positives.keys())
        assert metric._false_negatives[u"ARG1"] == 0
        assert metric._false_negatives[u"ARG2"] == 2
        assert u"O" not in list(metric._false_negatives.keys())
        assert metric._false_positives[u"ARG1"] == 2
        assert metric._false_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._false_positives.keys())

        metric_dict = metric.get_metric()

        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 0.666666666)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 0.5) 
Example #12
Source File: span_based_f1_measure_test.py    From magnitude with MIT License
def test_span_f1_matches_perl_script_for_continued_arguments(self):
        bio_tags = [u"B-ARG1", u"O", u"B-C-ARG1", u"B-V", u"B-ARGM-ADJ", u"O"]
        sentence = [u"Mark", u"and", u"Matt", u"were", u"running", u"fast", u"."]

        gold_indices = [self.vocab.get_token_index(x, u"tags") for x in bio_tags]
        gold_tensor = torch.Tensor([gold_indices])
        prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size(u"tags")])
        mask = torch.LongTensor([[1, 1, 1, 1, 1, 1]])

        # Make prediction so that it is exactly correct.
        for i, tag_index in enumerate(gold_indices):
            prediction_tensor[0, i, tag_index] = 1

        metric = SpanBasedF1Measure(self.vocab, u"tags")
        metric(prediction_tensor, gold_tensor, mask)
        metric_dict = metric.get_metric()

        # We merged the continued ARG1 label into a single span, so there should
        # be exactly 1 true positive for ARG1 and nothing present for C-ARG1
        assert metric._true_positives[u"ARG1"] == 1
        # The labels containing continuation references get merged into
        # the labels that they continue, so they should never appear in
        # the precision/recall counts.
        assert u"C-ARG1" not in list(metric._true_positives.keys())
        assert metric._true_positives[u"V"] == 1
        assert metric._true_positives[u"ARGM-ADJ"] == 1

        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 1.0)

        # Check that the number of true positive ARG1 labels is the same as the perl script's output:
        gold_file_path = os.path.join(self.TEST_DIR, u"gold_conll_eval.txt")
        prediction_file_path = os.path.join(self.TEST_DIR, u"prediction_conll_eval.txt")
        with open(gold_file_path, u"a+") as gold_file, open(prediction_file_path, u"a+") as prediction_file:
            # Use the same bio tags as prediction vs gold to make it obvious by looking
            # at the perl script output if something is wrong.
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)
        # Run the official perl script and collect stdout.
        perl_script_command = [u"perl", unicode(self.TOOLS_ROOT / u"srl-eval.pl"), prediction_file_path, gold_file_path]
        stdout = subprocess.check_output(perl_script_command, universal_newlines=True)
        stdout_lines = stdout.split(u"\n")
        # Parse the stdout of the perl script to find the ARG1 row (this happens to be line 8).
        num_correct_arg1_instances_from_perl_evaluation = int([token for token in
                                                               stdout_lines[8].split(u" ") if token][1])
        assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives[u"ARG1"]