Python allennlp.training.metrics.SpanBasedF1Measure() Examples

The following are 12 code examples of allennlp.training.metrics.SpanBasedF1Measure(), taken from open-source projects. Each example lists the source file and project it comes from. You may also want to check out all available functions/classes of the module allennlp.training.metrics.
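Before the project examples, here is a minimal usage sketch (not taken from any of the projects below; the vocabulary contents and tensor shapes are illustrative) showing how the metric is constructed, fed a batch of predictions, and read out:

import torch
from allennlp.data import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure

# Build a small vocabulary with a BIO-encoded tag namespace (illustrative).
vocab = Vocabulary()
for tag in ["O", "B-ARG1", "I-ARG1", "B-ARG2", "I-ARG2"]:
    vocab.add_token_to_namespace(tag, namespace="tags")

metric = SpanBasedF1Measure(vocab, tag_namespace="tags", label_encoding="BIO")

gold_tags = ["O", "B-ARG1", "I-ARG1", "O", "B-ARG2", "I-ARG2"]
gold = torch.tensor([[vocab.get_token_index(t, "tags") for t in gold_tags]])
logits = torch.rand(1, 6, vocab.get_vocab_size("tags"))  # (batch, seq_len, num_tags)

metric(logits, gold)                    # accumulate span counts for this batch
scores = metric.get_metric(reset=True)  # e.g. scores["f1-measure-overall"]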
Example #1
Source File: lstm_crf.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(
            in_features=encoder.get_output_dim(),
            out_features=vocab.get_vocab_size('labels')
        )
        self._crf = ConditionalRandomField(
            vocab.get_vocab_size('labels')
        )

        self._f1 = SpanBasedF1Measure(vocab, 'labels') 
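The tutorial snippet stops at the constructor; the sketch below (illustrative names, not the tutorial's actual code) shows where the metric is usually updated, inside forward(), using the padding mask derived from the text field:

from allennlp.nn import util

# Illustrative sketch: feed the per-token class scores, gold label ids, and
# padding mask to the span metric during the forward pass.
def _update_span_f1(self, tokens, logits, labels):
    mask = util.get_text_field_mask(tokens)  # (batch, seq_len)
    if labels is not None:
        self._f1(logits, labels, mask)       # logits: (batch, seq_len, num_labels)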
Example #2
Source File: crf_tagger.py    From didyprog with MIT License
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 constraint_type: str = None,
                 include_start_end_transitions: bool = True,
                 dropout: float = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or "BIO")

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        initializer(self) 
Example #3
Source File: span_based_f1_measure_test.py    From allennlp with Apache License 2.0
def test_bmes_span_metrics_are_computed_correctly(self, device: str):
        # (bmes_tags) B:0, M:1, E:2, S:3.
        # [S, B, M, E, S]
        # [S, S, S, S, S]
        gold_indices = [[3, 0, 1, 2, 3], [3, 3, 3, 3, 3]]
        gold_tensor = torch.tensor(gold_indices, device=device)

        prediction_tensor = torch.rand([2, 5, 4], device=device)
        # [S, B, E, S, S]
        # TP: 2, FP: 2, FN: 1.
        prediction_tensor[0, 0, 3] = 1  # (True positive)
        prediction_tensor[0, 1, 0] = 1  # (False positive
        prediction_tensor[0, 2, 2] = 1  # *)
        prediction_tensor[0, 3, 3] = 1  # (False positive)
        prediction_tensor[0, 4, 3] = 1  # (True positive)
        # [B, E, S, B, E]
        # TP: 1, FP: 2, FN: 4.
        prediction_tensor[1, 0, 0] = 1  # (False positive
        prediction_tensor[1, 1, 2] = 1  # *)
        prediction_tensor[1, 2, 3] = 1  # (True positive)
        prediction_tensor[1, 3, 0] = 1  # (False positive
        prediction_tensor[1, 4, 2] = 1  # *)

        metric = SpanBasedF1Measure(self.vocab, "bmes_tags", label_encoding="BMES")
        metric(prediction_tensor, gold_tensor)

        # TP: 3, FP: 4, FN: 5.
        metric_dict = metric.get_metric()

        assert_allclose(metric_dict["recall-overall"], 0.375, rtol=0.001, atol=1e-03)
        assert_allclose(metric_dict["precision-overall"], 0.428, rtol=0.001, atol=1e-03)
        assert_allclose(metric_dict["f1-measure-overall"], 0.4, rtol=0.001, atol=1e-03) 
Example #4
Source File: span_based_f1_measure_test.py    From allennlp with Apache License 2.0
def test_span_f1_accepts_tags_to_spans_function_argument(self, device: str):
        def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None):
            return [("mock", (42, 42))]

        # Should be ignored.
        bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
        gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags]
        gold_tensor = torch.tensor([gold_indices], device=device)
        prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device)

        metric = SpanBasedF1Measure(
            self.vocab,
            "tags",
            label_encoding=None,
            tags_to_spans_function=mock_tags_to_spans_function,
        )

        metric(prediction_tensor, gold_tensor)
        metric_dict = metric.get_metric()

        assert_allclose(metric_dict["recall-overall"], 1.0)
        assert_allclose(metric_dict["precision-overall"], 1.0)
        assert_allclose(metric_dict["f1-measure-overall"], 1.0)

        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, label_encoding="INVALID")
        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, tags_to_spans_function=mock_tags_to_spans_function)
        with pytest.raises(ConfigurationError):
            SpanBasedF1Measure(self.vocab, label_encoding=None, tags_to_spans_function=None) 
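A custom tags_to_spans_function only has to accept a list of string tags (plus an optional classes_to_ignore argument) and return (label, (start, end)) tuples with inclusive indices, as the mock above does. A slightly more realistic sketch (the function name and implementation are illustrative, not part of allennlp):

from typing import List, Optional, Tuple

# Illustrative sketch of the expected contract: a naive BIO span extractor.
def naive_bio_tags_to_spans(
    tag_sequence: List[str],
    classes_to_ignore: Optional[List[str]] = None,
) -> List[Tuple[str, Tuple[int, int]]]:
    classes_to_ignore = classes_to_ignore or []
    spans = []
    for start, tag in enumerate(tag_sequence):
        if tag.startswith("B-") and tag[2:] not in classes_to_ignore:
            end = start
            while end + 1 < len(tag_sequence) and tag_sequence[end + 1] == "I-" + tag[2:]:
                end += 1
            spans.append((tag[2:], (start, end)))
    return spans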
Example #5
Source File: semantic_role_labeler.py    From magnitude with MIT License
def __init__(self, vocab,
                 text_field_embedder,
                 encoder,
                 binary_feature_dim,
                 embedding_dropout=0.0,
                 initializer=InitializerApplicator(),
                 regularizer=None,
                 label_smoothing=None):
        super(SemanticRoleLabeler, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(u"labels")

        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=u"labels", ignore_classes=[u"V"])

        self.encoder = encoder
        # There are exactly 2 binary features for the verb predicate embedding.
        self.binary_feature_embedding = Embedding(2, binary_feature_dim)
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_classes))
        self.embedding_dropout = Dropout(p=embedding_dropout)
        self._label_smoothing = label_smoothing

        check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                               encoder.get_input_dim(),
                               u"text embedding dim + verb indicator embedding dim",
                               u"encoder input dim")
        initializer(self) 
Example #6
Source File: prolocal_model.py    From propara with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2seq_encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator) -> None:
        super(ProLocalModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder

        self.attention_layer = \
            Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                             seq2seq_encoder.get_output_dim()), normalize=True)

        self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_types)

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
        self.num_tags = self.vocab.get_vocab_size("state_change_tags")

        self.tag_projection_layer = TimeDistributed(Linear(self.seq2seq_encoder.get_output_dim() + 2,
                                                            self.num_tags))
        self._type_accuracy = CategoricalAccuracy()

        self.type_f1_metrics = {}
        self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
        for type_label in self.type_labels_vocab.values():
            self.type_f1_metrics["type_" + type_label] = F1Measure(self.vocab.get_token_index(type_label, "state_change_type_labels"))

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self) 
Example #7
Source File: lstm.py    From allennlp_tutorial with MIT License
def __init__(self,
                 vocab: Vocabulary,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder) -> None:
        super().__init__(vocab)

        self._embedder = embedder
        self._encoder = encoder
        self._classifier = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                           out_features=vocab.get_vocab_size('labels'))

        self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1') 
Example #8
Source File: simple_tagger.py    From HIT-SCIR-CoNLL2019 with Apache License 2.0
def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 calculate_span_f1: bool = None,
                 label_encoding: Optional[str] = None,
                 label_namespace: str = "labels",
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SimpleTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                           self.num_classes))

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (label_encoding serves the same purpose).
        if calculate_span_f1 and not label_encoding:
            raise ConfigurationError("calculate_span_f1 is True, but "
                                     "no label_encoding was specified.")
        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3)
        }

        if calculate_span_f1 or label_encoding:
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)
        else:
            self._f1_metric = None

        initializer(self) 
Example #9
Source File: simple_tagger.py    From allennlp with Apache License 2.0
def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        calculate_span_f1: bool = None,
        label_encoding: Optional[str] = None,
        label_namespace: str = "labels",
        verbose_metrics: bool = False,
        initializer: InitializerApplicator = InitializerApplicator(),
        **kwargs,
    ) -> None:
        super().__init__(vocab, **kwargs)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_classes)
        )

        check_dimensions_match(
            text_field_embedder.get_output_dim(),
            encoder.get_input_dim(),
            "text field embedding dim",
            "encoder input dim",
        )

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "accuracy3": CategoricalAccuracy(top_k=3),
        }

        # We keep calculate_span_f1 as a constructor argument for API consistency with
        # the CrfTagger, even though it is redundant in this class
        # (label_encoding serves the same purpose).
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError(
                    "calculate_span_f1 is True, but no label_encoding was specified."
                )
            self._f1_metric = SpanBasedF1Measure(
                vocab, tag_namespace=label_namespace, label_encoding=label_encoding
            )
        else:
            self._f1_metric = None

        initializer(self) 
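This snippet also stops at the constructor; the sketch below (illustrative, not copied verbatim from the project) shows how such a tagger typically exposes its metrics, keeping only the aggregated "-overall" span scores unless verbose_metrics is set:

from typing import Dict

# Illustrative sketch: merge token-level accuracy with the span F1 scores.
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
    metrics = {name: metric.get_metric(reset) for name, metric in self.metrics.items()}
    if self._f1_metric is not None:
        f1_dict = self._f1_metric.get_metric(reset=reset)
        if self._verbose_metrics:
            metrics.update(f1_dict)
        else:
            metrics.update({k: v for k, v in f1_dict.items() if "overall" in k})
    return metrics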
Example #10
Source File: crf_tagger.py    From magnitude with MIT License
def __init__(self, vocab,
                 text_field_embedder,
                 encoder,
                 label_namespace=u"labels",
                 constraint_type=None,
                 feedforward=None,
                 include_start_end_transitions=True,
                 dropout=None,
                 verbose_metrics=False,
                 initializer=InitializerApplicator(),
                 regularizer=None):
        super(CrfTagger, self).__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        if constraint_type is not None:
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(constraint_type, labels)
        else:
            constraints = None

        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.span_metric = SpanBasedF1Measure(vocab,
                                              tag_namespace=label_namespace,
                                              label_encoding=constraint_type or u"BIO")


        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               u"text field embedding dim", u"encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   u"encoder output dim", u"feedforward input dim")
        initializer(self)

Example #11
Source File: span_based_f1_measure_test.py    From magnitude with MIT License
def test_span_metrics_are_computed_correctly(self):
        gold_labels = [u"O", u"B-ARG1", u"I-ARG1", u"O", u"B-ARG2", u"I-ARG2", u"O", u"O", u"O"]
        gold_indices = [self.vocab.get_token_index(x, u"tags") for x in gold_labels]

        gold_tensor = torch.Tensor([gold_indices, gold_indices])

        prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size(u"tags")])

        # Test that the span measure ignores completely masked sequences by
        # passing a mask with a fully masked row.
        mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                                 [0, 0, 0, 0, 0, 0, 0, 0, 0]])

        prediction_tensor[:, 0, 0] = 1
        prediction_tensor[:, 1, 1] = 1  # (True positive - ARG1
        prediction_tensor[:, 2, 2] = 1  # *)
        prediction_tensor[:, 3, 0] = 1
        prediction_tensor[:, 4, 0] = 1  # (False Negative - ARG2
        prediction_tensor[:, 5, 0] = 1  # *)
        prediction_tensor[:, 6, 0] = 1
        prediction_tensor[:, 7, 1] = 1  # (False Positive - ARG1
        prediction_tensor[:, 8, 2] = 1  # *)

        metric = SpanBasedF1Measure(self.vocab, u"tags")
        metric(prediction_tensor, gold_tensor, mask)

        assert metric._true_positives[u"ARG1"] == 1
        assert metric._true_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._true_positives.keys())
        assert metric._false_negatives[u"ARG1"] == 0
        assert metric._false_negatives[u"ARG2"] == 1
        assert u"O" not in list(metric._false_negatives.keys())
        assert metric._false_positives[u"ARG1"] == 1
        assert metric._false_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._false_positives.keys())

        # Check things are accumulating correctly.
        metric(prediction_tensor, gold_tensor, mask)
        assert metric._true_positives[u"ARG1"] == 2
        assert metric._true_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._true_positives.keys())
        assert metric._false_negatives[u"ARG1"] == 0
        assert metric._false_negatives[u"ARG2"] == 2
        assert u"O" not in list(metric._false_negatives.keys())
        assert metric._false_positives[u"ARG1"] == 2
        assert metric._false_positives[u"ARG2"] == 0
        assert u"O" not in list(metric._false_positives.keys())

        metric_dict = metric.get_metric()

        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG2"], 0.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 0.666666666)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 0.5)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 0.5) 
Example #12
Source File: span_based_f1_measure_test.py    From magnitude with MIT License
def test_span_f1_matches_perl_script_for_continued_arguments(self):
        bio_tags = [u"B-ARG1", u"O", u"B-C-ARG1", u"B-V", u"B-ARGM-ADJ", u"O"]
        sentence = [u"Mark", u"and", u"Matt", u"were", u"running", u"fast", u"."]

        gold_indices = [self.vocab.get_token_index(x, u"tags") for x in bio_tags]
        gold_tensor = torch.Tensor([gold_indices])
        prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size(u"tags")])
        mask = torch.LongTensor([[1, 1, 1, 1, 1, 1]])

        # Make prediction so that it is exactly correct.
        for i, tag_index in enumerate(gold_indices):
            prediction_tensor[0, i, tag_index] = 1

        metric = SpanBasedF1Measure(self.vocab, u"tags")
        metric(prediction_tensor, gold_tensor, mask)
        metric_dict = metric.get_metric()

        # We merged the continued ARG1 label into a single span, so there should
        # be exactly 1 true positive for ARG1 and nothing present for C-ARG1
        assert metric._true_positives[u"ARG1"] == 1
        # The labels containing continuation references get merged into
        # the labels that they continue, so they should never appear in
        # the precision/recall counts.
        assert u"C-ARG1" not in list(metric._true_positives.keys())
        assert metric._true_positives[u"V"] == 1
        assert metric._true_positives[u"ARGM-ADJ"] == 1

        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-V"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARGM-ADJ"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 1.0)
        numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 1.0)

        # Check that the number of true positive ARG1 labels is the same as the perl script's output:
        gold_file_path = os.path.join(self.TEST_DIR, u"gold_conll_eval.txt")
        prediction_file_path = os.path.join(self.TEST_DIR, u"prediction_conll_eval.txt")
        with open(gold_file_path, u"a+") as gold_file, open(prediction_file_path, u"a+") as prediction_file:
            # Use the same bio tags as prediction vs gold to make it obvious by looking
            # at the perl script output if something is wrong.
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)
        # Run the official perl script and collect stdout.
        perl_script_command = [u"perl", unicode(self.TOOLS_ROOT / u"srl-eval.pl"), prediction_file_path, gold_file_path]
        stdout = subprocess.check_output(perl_script_command, universal_newlines=True)
        stdout_lines = stdout.split(u"\n")
        # Parse the stdout of the perl script to find the ARG1 row (this happens to be line 8).
        num_correct_arg1_instances_from_perl_evaluation = int([token for token in
                                                               stdout_lines[8].split(u" ") if token][1])
        assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives[u"ARG1"]