Python allennlp.training.metrics.SpanBasedF1Measure() Examples
The following are 12 code examples of allennlp.training.metrics.SpanBasedF1Measure(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.training.metrics, or try the search function.
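Before the project-specific examples, here is a minimal usage sketch of the metric itself, distilled from the calls shown below. It is illustrative only: the "labels" namespace, the small BIO tag set, and the random tensors are assumptions made for this sketch rather than code from any of the projects above, and the expected mask dtype (bool vs. long) depends on your AllenNLP version.

import torch
from allennlp.data import Vocabulary
from allennlp.training.metrics import SpanBasedF1Measure

# Illustrative vocabulary: a small BIO tag set under a "labels" namespace (assumption).
vocab = Vocabulary()
for tag in ["O", "B-ARG1", "I-ARG1", "B-ARG2", "I-ARG2"]:
    vocab.add_token_to_namespace(tag, "labels")

# The metric needs the vocabulary, the tag namespace, and a label encoding
# (BIO by default) so it can group per-token tags back into labeled spans.
metric = SpanBasedF1Measure(vocab, tag_namespace="labels", label_encoding="BIO")

# Fake model output: (batch_size, sequence_length, num_tags) scores,
# gold tag indices, and a mask with the same shape as the gold labels.
batch_size, sequence_length = 2, 5
num_tags = vocab.get_vocab_size("labels")
predictions = torch.rand(batch_size, sequence_length, num_tags)
gold_labels = torch.randint(0, num_tags, (batch_size, sequence_length))
mask = torch.ones(batch_size, sequence_length, dtype=torch.bool)

# Accumulate counts, then read out precision/recall/F1 per label and overall.
metric(predictions, gold_labels, mask)
scores = metric.get_metric(reset=True)
print(scores["precision-overall"], scores["recall-overall"], scores["f1-measure-overall"])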
Example #1
Source File: lstm_crf.py From allennlp_tutorial with MIT License | 6 votes |
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    super().__init__(vocab)

    self._embedder = embedder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(
        in_features=encoder.get_output_dim(),
        out_features=vocab.get_vocab_size('labels')
    )
    self._crf = ConditionalRandomField(
        vocab.get_vocab_size('labels')
    )

    self._f1 = SpanBasedF1Measure(vocab, 'labels')
Example #2
Source File: crf_tagger.py From didyprog with MIT License | 5 votes |
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             label_namespace: str = "labels",
             constraint_type: str = None,
             include_start_end_transitions: bool = True,
             dropout: float = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_tags = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
    else:
        self.dropout = None
    self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                       self.num_tags))

    if constraint_type is not None:
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(constraint_type, labels)
    else:
        constraints = None

    self.crf = ConditionalRandomField(
        self.num_tags, constraints,
        include_start_end_transitions=include_start_end_transitions
    )

    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace=label_namespace,
                                          label_encoding=constraint_type or "BIO")

    check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")
    initializer(self)
Example #3
Source File: span_based_f1_measure_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_bmes_span_metrics_are_computed_correctly(self, device: str):
    # (bmes_tags) B:0, M:1, E:2, S:3.
    # [S, B, M, E, S]
    # [S, S, S, S, S]
    gold_indices = [[3, 0, 1, 2, 3], [3, 3, 3, 3, 3]]
    gold_tensor = torch.tensor(gold_indices, device=device)

    prediction_tensor = torch.rand([2, 5, 4], device=device)

    # [S, B, E, S, S]
    # TP: 2, FP: 2, FN: 1.
    prediction_tensor[0, 0, 3] = 1  # (True positive)
    prediction_tensor[0, 1, 0] = 1  # (False positive
    prediction_tensor[0, 2, 2] = 1  # *)
    prediction_tensor[0, 3, 3] = 1  # (False positive)
    prediction_tensor[0, 4, 3] = 1  # (True positive)
    # [B, E, S, B, E]
    # TP: 1, FP: 2, FN: 4.
    prediction_tensor[1, 0, 0] = 1  # (False positive
    prediction_tensor[1, 1, 2] = 1  # *)
    prediction_tensor[1, 2, 3] = 1  # (True positive)
    prediction_tensor[1, 3, 0] = 1  # (False positive
    prediction_tensor[1, 4, 2] = 1  # *)

    metric = SpanBasedF1Measure(self.vocab, "bmes_tags", label_encoding="BMES")
    metric(prediction_tensor, gold_tensor)

    # TP: 3, FP: 4, FN: 5.
    metric_dict = metric.get_metric()

    assert_allclose(metric_dict["recall-overall"], 0.375, rtol=0.001, atol=1e-03)
    assert_allclose(metric_dict["precision-overall"], 0.428, rtol=0.001, atol=1e-03)
    assert_allclose(metric_dict["f1-measure-overall"], 0.4, rtol=0.001, atol=1e-03)
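For reference, the expected values in this test follow directly from the span counts in the comments: with TP = 3, FP = 4, FN = 5 overall, precision = 3 / (3 + 4) ≈ 0.428, recall = 3 / (3 + 5) = 0.375, and F1 = 2 * P * R / (P + R) = 0.4.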
Example #4
Source File: span_based_f1_measure_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_span_f1_accepts_tags_to_spans_function_argument(self, device: str):
    def mock_tags_to_spans_function(tag_sequence, classes_to_ignore=None):
        return [("mock", (42, 42))]

    # Should be ignored.
    bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
    gold_indices = [self.vocab.get_token_index(x, "tags") for x in bio_tags]
    gold_tensor = torch.tensor([gold_indices], device=device)
    prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")], device=device)

    metric = SpanBasedF1Measure(
        self.vocab,
        "tags",
        label_encoding=None,
        tags_to_spans_function=mock_tags_to_spans_function,
    )

    metric(prediction_tensor, gold_tensor)
    metric_dict = metric.get_metric()

    assert_allclose(metric_dict["recall-overall"], 1.0)
    assert_allclose(metric_dict["precision-overall"], 1.0)
    assert_allclose(metric_dict["f1-measure-overall"], 1.0)

    with pytest.raises(ConfigurationError):
        SpanBasedF1Measure(self.vocab, label_encoding="INVALID")
    with pytest.raises(ConfigurationError):
        SpanBasedF1Measure(self.vocab, tags_to_spans_function=mock_tags_to_spans_function)
    with pytest.raises(ConfigurationError):
        SpanBasedF1Measure(self.vocab, label_encoding=None, tags_to_spans_function=None)
Example #5
Source File: semantic_role_labeler.py From magnitude with MIT License | 5 votes |
def __init__(self,
             vocab,
             text_field_embedder,
             encoder,
             binary_feature_dim,
             embedding_dropout=0.0,
             initializer=InitializerApplicator(),
             regularizer=None,
             label_smoothing=None):
    super(SemanticRoleLabeler, self).__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size(u"labels")

    # For the span based evaluation, we don't want to consider labels
    # for verb, because the verb index is provided to the model.
    self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=u"labels",
                                          ignore_classes=[u"V"])

    self.encoder = encoder
    # There are exactly 2 binary features for the verb predicate embedding.
    self.binary_feature_embedding = Embedding(2, binary_feature_dim)
    self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                       self.num_classes))
    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing

    check_dimensions_match(text_field_embedder.get_output_dim() + binary_feature_dim,
                           encoder.get_input_dim(),
                           u"text embedding dim + verb indicator embedding dim",
                           u"encoder input dim")
    initializer(self)
Example #6
Source File: prolocal_model.py From propara with Apache License 2.0 | 5 votes |
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             seq2seq_encoder: Seq2SeqEncoder,
             initializer: InitializerApplicator) -> None:
    super(ProLocalModel, self).__init__(vocab)

    self.text_field_embedder = text_field_embedder
    self.seq2seq_encoder = seq2seq_encoder

    self.attention_layer = \
        Attention(similarity_function=BilinearSimilarity(2 * seq2seq_encoder.get_output_dim(),
                                                         seq2seq_encoder.get_output_dim()),
                  normalize=True)

    self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
    self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                        self.num_types)

    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
    self.num_tags = self.vocab.get_vocab_size("state_change_tags")

    self.tag_projection_layer = TimeDistributed(Linear(self.seq2seq_encoder.get_output_dim() + 2,
                                                       self.num_tags))

    self._type_accuracy = CategoricalAccuracy()
    self.type_f1_metrics = {}
    self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary("state_change_type_labels")
    for type_label in self.type_labels_vocab.values():
        self.type_f1_metrics["type_" + type_label] = F1Measure(
            self.vocab.get_token_index(type_label, "state_change_type_labels"))

    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
Example #7
Source File: lstm.py From allennlp_tutorial with MIT License | 5 votes |
def __init__(self,
             vocab: Vocabulary,
             embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder) -> None:
    super().__init__(vocab)

    self._embedder = embedder
    self._encoder = encoder
    self._classifier = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                       out_features=vocab.get_vocab_size('labels'))
    self._f1 = SpanBasedF1Measure(vocab, 'labels', 'IOB1')
Example #8
Source File: simple_tagger.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 5 votes |
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             calculate_span_f1: bool = None,
             label_encoding: Optional[str] = None,
             label_namespace: str = "labels",
             verbose_metrics: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SimpleTagger, self).__init__(vocab, regularizer)

    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
                                                       self.num_classes))

    check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")

    # We keep calculate_span_f1 as a constructor argument for API consistency with
    # the CrfTagger, even though it is redundant in this class
    # (label_encoding serves the same purpose).
    if calculate_span_f1 and not label_encoding:
        raise ConfigurationError("calculate_span_f1 is True, but "
                                 "no label_encoding was specified.")
    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "accuracy3": CategoricalAccuracy(top_k=3)
    }

    if calculate_span_f1 or label_encoding:
        self._f1_metric = SpanBasedF1Measure(vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=label_encoding)
    else:
        self._f1_metric = None

    initializer(self)
Example #9
Source File: simple_tagger.py From allennlp with Apache License 2.0 | 4 votes |
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    calculate_span_f1: bool = None,
    label_encoding: Optional[str] = None,
    label_namespace: str = "labels",
    verbose_metrics: bool = False,
    initializer: InitializerApplicator = InitializerApplicator(),
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)

    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_classes = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    self.tag_projection_layer = TimeDistributed(
        Linear(self.encoder.get_output_dim(), self.num_classes)
    )

    check_dimensions_match(
        text_field_embedder.get_output_dim(),
        encoder.get_input_dim(),
        "text field embedding dim",
        "encoder input dim",
    )
    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "accuracy3": CategoricalAccuracy(top_k=3),
    }

    # We keep calculate_span_f1 as a constructor argument for API consistency with
    # the CrfTagger, even though it is redundant in this class
    # (label_encoding serves the same purpose).
    if calculate_span_f1 is None:
        calculate_span_f1 = label_encoding is not None

    self.calculate_span_f1 = calculate_span_f1
    if calculate_span_f1:
        if not label_encoding:
            raise ConfigurationError(
                "calculate_span_f1 is True, but no label_encoding was specified."
            )
        self._f1_metric = SpanBasedF1Measure(
            vocab, tag_namespace=label_namespace, label_encoding=label_encoding
        )
    else:
        self._f1_metric = None

    initializer(self)
Example #10
Source File: crf_tagger.py From magnitude with MIT License | 4 votes |
def __init__(self, vocab,
             text_field_embedder,
             encoder,
             label_namespace=u"labels",
             constraint_type=None,
             feedforward=None,
             include_start_end_transitions=True,
             dropout=None,
             verbose_metrics=False,
             initializer=InitializerApplicator(),
             regularizer=None):
    super(CrfTagger, self).__init__(vocab, regularizer)

    self.label_namespace = label_namespace
    self.text_field_embedder = text_field_embedder
    self.num_tags = self.vocab.get_vocab_size(label_namespace)
    self.encoder = encoder
    self._verbose_metrics = verbose_metrics
    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
    else:
        self.dropout = None
    self._feedforward = feedforward

    if feedforward is not None:
        output_dim = feedforward.get_output_dim()
    else:
        output_dim = self.encoder.get_output_dim()
    self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_tags))

    if constraint_type is not None:
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(constraint_type, labels)
    else:
        constraints = None

    self.crf = ConditionalRandomField(
        self.num_tags, constraints,
        include_start_end_transitions=include_start_end_transitions
    )

    self.span_metric = SpanBasedF1Measure(vocab,
                                          tag_namespace=label_namespace,
                                          label_encoding=constraint_type or u"BIO")

    check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                           u"text field embedding dim", u"encoder input dim")
    if feedforward is not None:
        check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                               u"encoder output dim", u"feedforward input dim")
    initializer(self)

#overrides
Example #11
Source File: span_based_f1_measure_test.py From magnitude with MIT License | 4 votes |
def test_span_metrics_are_computed_correctly(self):
    gold_labels = [u"O", u"B-ARG1", u"I-ARG1", u"O", u"B-ARG2", u"I-ARG2", u"O", u"O", u"O"]
    gold_indices = [self.vocab.get_token_index(x, u"tags") for x in gold_labels]

    gold_tensor = torch.Tensor([gold_indices])
    prediction_tensor = torch.rand([2, 9, self.vocab.get_vocab_size(u"tags")])

    # Test that the span measure ignores completely masked sequences by
    # passing a mask with a fully masked row.
    mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
                             [0, 0, 0, 0, 0, 0, 0, 0, 0]])

    prediction_tensor[:, 0, 0] = 1
    prediction_tensor[:, 1, 1] = 1  # (True positive - ARG1
    prediction_tensor[:, 2, 2] = 1  # *)
    prediction_tensor[:, 3, 0] = 1
    prediction_tensor[:, 4, 0] = 1  # (False Negative - ARG2
    prediction_tensor[:, 5, 0] = 1  # *)
    prediction_tensor[:, 6, 0] = 1
    prediction_tensor[:, 7, 1] = 1  # (False Positive - ARG1
    prediction_tensor[:, 8, 2] = 1  # *)

    metric = SpanBasedF1Measure(self.vocab, u"tags")
    metric(prediction_tensor, gold_tensor, mask)

    assert metric._true_positives[u"ARG1"] == 1
    assert metric._true_positives[u"ARG2"] == 0
    assert u"O" not in list(metric._true_positives.keys())
    assert metric._false_negatives[u"ARG1"] == 0
    assert metric._false_negatives[u"ARG2"] == 1
    assert u"O" not in list(metric._false_negatives.keys())
    assert metric._false_positives[u"ARG1"] == 1
    assert metric._false_positives[u"ARG2"] == 0
    assert u"O" not in list(metric._false_positives.keys())

    # Check things are accumulating correctly.
    metric(prediction_tensor, gold_tensor, mask)
    assert metric._true_positives[u"ARG1"] == 2
    assert metric._true_positives[u"ARG2"] == 0
    assert u"O" not in list(metric._true_positives.keys())
    assert metric._false_negatives[u"ARG1"] == 0
    assert metric._false_negatives[u"ARG2"] == 2
    assert u"O" not in list(metric._false_negatives.keys())
    assert metric._false_positives[u"ARG1"] == 2
    assert metric._false_positives[u"ARG2"] == 0
    assert u"O" not in list(metric._false_positives.keys())

    metric_dict = metric.get_metric()

    numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG2"], 0.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG2"], 0.0)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG2"], 0.0)
    numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 0.5)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 0.666666666)
    numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 0.5)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 0.5)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 0.5)
Example #12
Source File: span_based_f1_measure_test.py From magnitude with MIT License | 4 votes |
def test_span_f1_matches_perl_script_for_continued_arguments(self):
    bio_tags = [u"B-ARG1", u"O", u"B-C-ARG1", u"B-V", u"B-ARGM-ADJ", u"O"]
    sentence = [u"Mark", u"and", u"Matt", u"were", u"running", u"fast", u"."]

    gold_indices = [self.vocab.get_token_index(x, u"tags") for x in bio_tags]
    gold_tensor = torch.Tensor([gold_indices])
    prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size(u"tags")])
    mask = torch.LongTensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])

    # Make prediction so that it is exactly correct.
    for i, tag_index in enumerate(gold_indices):
        prediction_tensor[0, i, tag_index] = 1

    metric = SpanBasedF1Measure(self.vocab, u"tags")
    metric(prediction_tensor, gold_tensor, mask)
    metric_dict = metric.get_metric()

    # We merged the continued ARG1 label into a single span, so there should
    # be exactly 1 true positive for ARG1 and nothing present for C-ARG1.
    assert metric._true_positives[u"ARG1"] == 1
    # The labels containing continuation references get merged into
    # the labels that they continue, so they should never appear in
    # the precision/recall counts.
    assert u"C-ARG1" not in list(metric._true_positives.keys())
    assert metric._true_positives[u"V"] == 1
    assert metric._true_positives[u"ARGM-ADJ"] == 1

    numpy.testing.assert_almost_equal(metric_dict[u"recall-ARG1"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-ARG1"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARG1"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"recall-V"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-V"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-V"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"recall-ARGM-ADJ"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-ARGM-ADJ"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-ARGM-ADJ"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"recall-overall"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"precision-overall"], 1.0)
    numpy.testing.assert_almost_equal(metric_dict[u"f1-measure-overall"], 1.0)

    # Check that the number of true positive ARG1 labels is the same as the perl script's output:
    gold_file_path = os.path.join(self.TEST_DIR, u"gold_conll_eval.txt")
    prediction_file_path = os.path.join(self.TEST_DIR, u"prediction_conll_eval.txt")
    with open(gold_file_path, u"a+") as gold_file, open(prediction_file_path, u"a+") as prediction_file:
        # Use the same bio tags as prediction vs gold to make it obvious by looking
        # at the perl script output if something is wrong.
        write_to_conll_eval_file(gold_file, prediction_file,
                                 4, sentence, bio_tags, bio_tags)
    # Run the official perl script and collect stdout.
    perl_script_command = [u"perl", unicode(self.TOOLS_ROOT / u"srl-eval.pl"),
                           prediction_file_path, gold_file_path]
    stdout = subprocess.check_output(perl_script_command, universal_newlines=True)
    stdout_lines = stdout.split(u"\n")
    # Parse the stdout of the perl script to find the ARG1 row (this happens to be line 8).
    num_correct_arg1_instances_from_perl_evaluation = int(
        [token for token in stdout_lines[8].split(u" ") if token][1])
    assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives[u"ARG1"]