Python allennlp.data.fields.SpanField() Examples

The following are 22 code examples of allennlp.data.fields.SpanField(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the allennlp.data.fields module.
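Before turning to the project code, here is a minimal, self-contained sketch of the SpanField contract that the examples below rely on (the sentence, variable names, and indexer choice are illustrative, not taken from any of the projects): a SpanField stores a pair of token indices into a sequence field, with both ends inclusive, and serializes to a tensor of those two indices.

from allennlp.data.fields import SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# A four-token sentence to index spans into.
text = TextField([Token(t) for t in "The quick brown fox".split()],
                 {"tokens": SingleIdTokenIndexer()})

span = SpanField(1, 2, text)   # covers "quick brown"; both indices are inclusive
tensor = span.as_tensor(span.get_padding_lengths())   # LongTensor([1, 2])
empty = span.empty_field()     # padding sentinel: span_start == span_end == -1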
Example #1
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #2
Source File: span_field_test.py    From magnitude with MIT License
def test_empty_span_field_works(self):
        span_field = SpanField(1, 3, self.text)
        empty_span = span_field.empty_field()
        assert empty_span.span_start == -1
        assert empty_span.span_end == -1 
Example #3
Source File: test_dict_field.py    From kb with Apache License 2.0
def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        instance3_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.1, 0.1, 0.8],
                                                           [1.0, 0.0, 0.0],
                                                           [0.33, 0.67, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field)],
            )
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(self.instance2_fields)])}),
                     Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(instance3_fields)])})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(batch['candidates']['candidate_entities']['entity'].shape == batch['candidates']['candidate_entity_prior'].shape) 
Example #4
Source File: paired_span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = example['paired_c_tokens']
        assert len(joint_tokens_seq) <= 512

        segments_ids = example['segment_ids']

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        # text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
Example #5
Source File: span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
        assert len(joint_tokens_seq) < 512

        text1_len = len(example['query_c_tokens']) + 2
        text2_len = len(example['context_c_tokens']) + 1

        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
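As a quick sanity check of the begin-inclusive, end-exclusive span arithmetic in the comments above, here is a hedged, worked mini-example with made-up token lists (the real reader fills these in from example):

query_c_tokens = ['who', 'wrote', 'it']
context_c_tokens = ['Jane', 'Austen', 'did']
seq = ['[CLS]'] + query_c_tokens + ['[SEP]'] + context_c_tokens + ['[SEP]']

# Query tokens sit at positions 1 .. len(query), so the exclusive end is 1 + len(query);
# the context starts one '[SEP]' after that.
text1_span = (1, 1 + len(query_c_tokens))                                     # (1, 4)
text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(context_c_tokens))   # (5, 8)

assert seq[text1_span[0]:text1_span[1]] == query_c_tokens
assert seq[text2_span[0]:text2_span[1]] == context_c_tokens

Note that SpanField itself only validates that the start does not exceed the end and that the end fits inside the sequence (Examples #7 and #8 below test exactly those errors), so storing an exclusive end in a SpanField works only as long as downstream code keeps the convention straight, which is what the comments above warn about.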
Example #6
Source File: bert_fever_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #7
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
        with pytest.raises(ValueError):
            _ = SpanField(1, 30, self.text) 
Example #8
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_on_ill_defined_span(self):
        with pytest.raises(ValueError):
            _ = SpanField(4, 1, self.text) 
Example #9
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_on_incorrect_label_type(self):
        with pytest.raises(TypeError):
            _ = SpanField(u"hello", 3, self.text) 
Example #10
Source File: span_field_test.py    From magnitude with MIT License
def test_as_tensor_converts_span_field_correctly(self):
        span_field = SpanField(2, 3, self.text)
        tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_equal(tensor, numpy.array([2, 3])) 
Example #11
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_equality(self):
        span_field1 = SpanField(2, 3, self.text)
        span_field2 = SpanField(2, 3, self.text)
        span_field3 = SpanField(
            2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
        )

        assert span_field1 == (2, 3)
        assert span_field1 == span_field1
        assert span_field1 == span_field2
        assert span_field1 != span_field3
        assert span_field2 != span_field3 
Example #12
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_empty_span_field_works(self):
        span_field = SpanField(1, 3, self.text)
        empty_span = span_field.empty_field()
        assert empty_span.span_start == -1
        assert empty_span.span_end == -1 
Example #13
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
        with pytest.raises(ValueError):
            _ = SpanField(1, 30, self.text) 
Example #14
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_on_ill_defined_span(self):
        with pytest.raises(ValueError):
            _ = SpanField(4, 1, self.text) 
Example #15
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_on_incorrect_label_type(self):
        with pytest.raises(TypeError):
            _ = SpanField("hello", 3, self.text) 
Example #16
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_converts_span_field_correctly(self):
        span_field = SpanField(2, 3, self.text)
        tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_equal(tensor, numpy.array([2, 3])) 
Example #17
Source File: bert_reader_context_selection.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         query: str,  # Important type information
                         context: str,
                         fid: str = None,
                         qid: str = None,
                         selection_label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(query)
        tokenized_text2 = self.bert_tokenizer.tokenize(context)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:self.context_l]

        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        # text1_len = len(tokenized_text1) + 1
        # text2_len = len(tokenized_text2) + 1

        # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

        if selection_label:
            fields['label'] = LabelField(selection_label, label_namespace='labels')

        assert fid is not None
        assert qid is not None
        fields['fid'] = IdField(fid)
        fields['qid'] = IdField(qid)

        return Instance(fields) 
Example #18
Source File: bert_fever_verification_separate_seq.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         s1: str,  # Important type information
                         s2: str,
                         pid: str,
                         selection_label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(s1)
        tokenized_text2 = self.bert_tokenizer.tokenize(s2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:self.s2_l]

        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        # text1_len = len(tokenized_text1) + 1
        # text2_len = len(tokenized_text2) + 1

        # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

        if selection_label:
            fields['label'] = LabelField(selection_label, label_namespace='labels')

        # assert fid is not None
        assert pid is not None
        # fields['fid'] = IdField(fid)
        fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #19
Source File: wiki_linking_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokenized_text: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         doc_id: str = None):

        assert doc_id is not None

        token_field = TextField([Token(x) for x in tokenized_text], self.token_indexers)
        span_fields = ListField([SpanField(*span, token_field) for span in candidate_spans])

        candidate_entities = TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer)

        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)
        prior_field = ArrayField(np_prior)

        # only one segment
        candidate_segment_ids = ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int64
        )

        fields = {
            "tokens": token_field,
            "candidate_spans": span_fields,
            "candidate_entities": candidate_entities,
            "candidate_entity_prior": prior_field,
            "candidate_segment_ids": candidate_segment_ids
            }
        if gold_entities:
            labels = TextField([Token(entity) for entity in gold_entities],
                               token_indexers=self.entity_indexer)
            fields["gold_entities"] = labels

        fields["doc_id"] = MetadataField(doc_id)

        if self.extra_candidate_generators:
            tokens = " ".join(tokenized_text)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #20
Source File: wordnet.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokens: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         gold_data_ids: List[str] = None):

        # prior needs to be 2D and full
        # can look like [[0.2, 0.8], [1.0]]  if one candidate for second
        # candidate span and two candidates for first
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        fields = {
            "tokens": TextField([Token(t) for t in tokens],
                      token_indexers=self.token_indexers),

            # join by space, then retokenize in the "character indexer"
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array(np_prior)),
            # only one sentence
            "candidate_segment_ids": ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int64
            )
        }

        if gold_entities is not None:
            fields["gold_entities"] =  TextField([Token(entity) for entity in gold_entities],
                                                  token_indexers=self.entity_indexer)
        if gold_data_ids is not None:
            fields["gold_data_ids"] = MetadataField(gold_data_ids)

        span_fields = []
        for span in candidate_spans:
            span_fields.append(SpanField(span[0], span[1], fields['tokens']))
        fields['candidate_spans'] = ListField(span_fields)

        if self.extra_candidate_generators:
            tokens = " ".join(tokens)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #21
Source File: bert_tokenizer_and_candidate_generator.py    From kb with Apache License 2.0
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to
        a dict of fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
                [Token(t, text_id=self.bert_tokenizer.vocab[t])
                    for t in tokens_and_candidates['tokens']],
                token_indexers=self._bert_single_id_indexer
        )

        fields['segment_ids'] = ArrayField(
            np.array(tokens_and_candidates['segment_ids']), dtype=np.int64
        )

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates['candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                    entity_candidates['candidate_entity_priors']
            )
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities": TextField(
                    [Token(" ".join(candidate_list)) for candidate_list in entity_candidates["candidate_entities"]],
                    token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans": ListField(
                    [SpanField(span[0], span[1], fields['tokens']) for span in
                    entity_candidates['candidate_spans']]
                ),
                "candidate_segment_ids": ArrayField(
                    np.array(entity_candidates['candidate_segment_ids']), dtype=np.int64
        )
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields 
Example #22
Source File: test_dict_field.py    From kb with Apache License 2.0
def setUp(self):
        super(TestDictField, self).setUp()

        entity_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("entity1", "entity")
        self.vocab.add_token_to_namespace("entity2", "entity")
        self.vocab.add_token_to_namespace("entity3", "entity")
        self.entity_indexer = {"entity": TokenCharactersIndexerTokenizer(
            "entity", character_tokenizer=entity_tokenizer)
        }

        tokens1 = "The sentence .".split()
        tokens_field = TextField(
            [Token(t) for t in tokens1],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        self.instance1_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2"), Token("entity_unk")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(0, 0, tokens_field),
                     SpanField(1, 2, tokens_field)]
            )
        }

        tokens2 = "The sentence".split()
        tokens2_field = TextField(
            [Token(t) for t in tokens2], 
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        self.instance2_fields = {
            "candidate_entities": TextField(
                    [Token("entity1")], 
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[1.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens2_field)],
            )
        }
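A closing observation that ties the fixtures above to the batching exercised in Example #3 (stated here as a hedged note, not project code): when two instances whose candidate_spans ListFields have different lengths are batched together, the shorter list is padded with the result of empty_field(), so the padded rows come out as (-1, -1) pairs, the sentinel tested in Examples #2 and #12, which downstream code can mask out. A tiny sketch, reusing tokens2_field from the setUp above:

padding_span = SpanField(1, 1, tokens2_field).empty_field()
assert (padding_span.span_start, padding_span.span_end) == (-1, -1)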