Python allennlp.data.fields.SpanField() Examples

The following are 22 code examples of allennlp.data.fields.SpanField(), drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the allennlp.data.fields module.
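Before turning to the project code, here is a minimal, self-contained sketch of the SpanField contract that the examples below rely on (the sentence, variable names, and indexer choice are illustrative, not taken from any of the projects): a SpanField stores a pair of token indices into a sequence field, with both ends inclusive, and serializes to a tensor of those two indices.

from allennlp.data.fields import SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# A four-token sentence to index spans into.
text = TextField([Token(t) for t in "The quick brown fox".split()],
                 {"tokens": SingleIdTokenIndexer()})

span = SpanField(1, 2, text)   # covers "quick brown"; both indices are inclusive
tensor = span.as_tensor(span.get_padding_lengths())   # LongTensor([1, 2])
empty = span.empty_field()     # padding sentinel: span_start == span_end == -1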
Example #1
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #2
Source File: span_field_test.py    From magnitude with MIT License
def test_empty_span_field_works(self):
        span_field = SpanField(1, 3, self.text)
        empty_span = span_field.empty_field()
        assert empty_span.span_start == -1
        assert empty_span.span_end == -1 
Example #3
Source File: test_dict_field.py    From kb with Apache License 2.0
def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        instance3_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.1, 0.1, 0.8],
                                                           [1.0, 0.0, 0.0],
                                                           [0.33, 0.67, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field)],
            )
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(self.instance2_fields)])}),
                     Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(instance3_fields)])})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(batch['candidates']['candidate_entities']['entity'].shape == batch['candidates']['candidate_entity_prior'].shape) 
Example #4
Source File: paired_span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = example['paired_c_tokens']
        assert len(joint_tokens_seq) <= 512

        segments_ids = example['segment_ids']

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        # text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
Example #5
Source File: span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
        assert len(joint_tokens_seq) < 512

        text1_len = len(example['query_c_tokens']) + 2
        text2_len = len(example['context_c_tokens']) + 1

        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
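As a quick sanity check of the begin-inclusive, end-exclusive span arithmetic in the comments above, here is a hedged, worked mini-example with made-up token lists (the real reader fills these in from example):

query_c_tokens = ['who', 'wrote', 'it']
context_c_tokens = ['Jane', 'Austen', 'did']
seq = ['[CLS]'] + query_c_tokens + ['[SEP]'] + context_c_tokens + ['[SEP]']

# Query tokens sit at positions 1 .. len(query), so the exclusive end is 1 + len(query);
# the context starts one '[SEP]' after that.
text1_span = (1, 1 + len(query_c_tokens))                                     # (1, 4)
text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(context_c_tokens))   # (5, 8)

assert seq[text1_span[0]:text1_span[1]] == query_c_tokens
assert seq[text2_span[0]:text2_span[1]] == context_c_tokens

Note that SpanField itself only validates that the start does not exceed the end and that the end fits inside the sequence (Examples #7 and #8 below test exactly those errors), so storing an exclusive end in a SpanField works only as long as downstream code keeps the convention straight, which is what the comments above warn about.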
Example #6
Source File: bert_fever_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #7
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
        with pytest.raises(ValueError):
            _ = SpanField(1, 30, self.text) 
Example #8
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_on_ill_defined_span(self):
        with pytest.raises(ValueError):
            _ = SpanField(4, 1, self.text) 
Example #9
Source File: span_field_test.py    From magnitude with MIT License
def test_span_field_raises_on_incorrect_label_type(self):
        with pytest.raises(TypeError):
            _ = SpanField(u"hello", 3, self.text) 
Example #10
Source File: span_field_test.py    From magnitude with MIT License
def test_as_tensor_converts_span_field_correctly(self):
        span_field = SpanField(2, 3, self.text)
        tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_equal(tensor, numpy.array([2, 3])) 
Example #11
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_equality(self):
        span_field1 = SpanField(2, 3, self.text)
        span_field2 = SpanField(2, 3, self.text)
        span_field3 = SpanField(
            2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
        )

        assert span_field1 == (2, 3)
        assert span_field1 == span_field1
        assert span_field1 == span_field2
        assert span_field1 != span_field3
        assert span_field2 != span_field3 
Example #12
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_empty_span_field_works(self):
        span_field = SpanField(1, 3, self.text)
        empty_span = span_field.empty_field()
        assert empty_span.span_start == -1
        assert empty_span.span_end == -1 
Example #13
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
        with pytest.raises(ValueError):
            _ = SpanField(1, 30, self.text) 
Example #14
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_on_ill_defined_span(self):
        with pytest.raises(ValueError):
            _ = SpanField(4, 1, self.text) 
Example #15
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_span_field_raises_on_incorrect_label_type(self):
        with pytest.raises(TypeError):
            _ = SpanField("hello", 3, self.text) 
Example #16
Source File: span_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_converts_span_field_correctly(self):
        span_field = SpanField(2, 3, self.text)
        tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
        numpy.testing.assert_array_equal(tensor, numpy.array([2, 3])) 
Example #17
Source File: bert_reader_context_selection.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         query: str,  # Important type information
                         context: str,
                         fid: str = None,
                         qid: str = None,
                         selection_label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(query)
        tokenized_text2 = self.bert_tokenizer.tokenize(context)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:self.context_l]

        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        # text1_len = len(tokenized_text1) + 1
        # text2_len = len(tokenized_text2) + 1

        # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

        if selection_label:
            fields['label'] = LabelField(selection_label, label_namespace='labels')

        assert fid is not None
        assert qid is not None
        fields['fid'] = IdField(fid)
        fields['qid'] = IdField(qid)

        return Instance(fields) 
Example #18
Source File: bert_fever_verification_separate_seq.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         s1: str,  # Important type information
                         s2: str,
                         pid: str,
                         selection_label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(s1)
        tokenized_text2 = self.bert_tokenizer.tokenize(s2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:self.s2_l]

        s1_tokens_seq = ['[CLS]'] + tokenized_text1
        s2_tokens_seq = ['[CLS]'] + tokenized_text2

        # text1_len = len(tokenized_text1) + 1
        # text2_len = len(tokenized_text2) + 1

        # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
        s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

        fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
        fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

        text1_span = (1, len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (1, len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

        if selection_label:
            fields['label'] = LabelField(selection_label, label_namespace='labels')

        # assert fid is not None
        assert pid is not None
        # fields['fid'] = IdField(fid)
        fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #19
Source File: wiki_linking_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokenized_text: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         doc_id: str = None):

        assert doc_id is not None

        token_field = TextField([Token(x) for x in tokenized_text], self.token_indexers)
        span_fields = ListField([SpanField(*span, token_field) for span in candidate_spans])

        candidate_entities = TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer)

        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)
        prior_field = ArrayField(np_prior)

        # only one segment
        candidate_segment_ids = ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int64
        )

        fields = {
            "tokens": token_field,
            "candidate_spans": span_fields,
            "candidate_entities": candidate_entities,
            "candidate_entity_prior": prior_field,
            "candidate_segment_ids": candidate_segment_ids
            }
        if gold_entities:
            labels = TextField([Token(entity) for entity in gold_entities],
                               token_indexers=self.entity_indexer)
            fields["gold_entities"] = labels

        fields["doc_id"] = MetadataField(doc_id)

        if self.extra_candidate_generators:
            tokens = " ".join(tokenized_text)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #20
Source File: wordnet.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokens: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         gold_data_ids: List[str] = None):

        # prior needs to be 2D and full
        # can look like [[0.2, 0.8], [1.0]]  if one candidate for second
        # candidate span and two candidates for first
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        fields = {
            "tokens": TextField([Token(t) for t in tokens],
                      token_indexers=self.token_indexers),

            # join by space, then retokenize in the "character indexer"
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array(np_prior)),
            # only one sentence
            "candidate_segment_ids": ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int64
            )
        }

        if gold_entities is not None:
            fields["gold_entities"] =  TextField([Token(entity) for entity in gold_entities],
                                                  token_indexers=self.entity_indexer)
        if gold_data_ids is not None:
            fields["gold_data_ids"] = MetadataField(gold_data_ids)

        span_fields = []
        for span in candidate_spans:
            span_fields.append(SpanField(span[0], span[1], fields['tokens']))
        fields['candidate_spans'] = ListField(span_fields)

        if self.extra_candidate_generators:
            tokens = " ".join(tokens)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #21
Source File: bert_tokenizer_and_candidate_generator.py    From kb with Apache License 2.0
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to
        a dict of fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
                [Token(t, text_id=self.bert_tokenizer.vocab[t])
                    for t in tokens_and_candidates['tokens']],
                token_indexers=self._bert_single_id_indexer
        )

        fields['segment_ids'] = ArrayField(
            np.array(tokens_and_candidates['segment_ids']), dtype=np.int64
        )

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates['candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                    entity_candidates['candidate_entity_priors']
            )
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities": TextField(
                    [Token(" ".join(candidate_list)) for candidate_list in entity_candidates["candidate_entities"]],
                    token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans": ListField(
                    [SpanField(span[0], span[1], fields['tokens']) for span in
                    entity_candidates['candidate_spans']]
                ),
                "candidate_segment_ids": ArrayField(
                    np.array(entity_candidates['candidate_segment_ids']), dtype=np.int64
        )
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields 
Example #22
Source File: test_dict_field.py    From kb with Apache License 2.0
def setUp(self):
        super(TestDictField, self).setUp()

        entity_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

        self.vocab = Vocabulary()
        self.vocab.add_token_to_namespace("entity1", "entity")
        self.vocab.add_token_to_namespace("entity2", "entity")
        self.vocab.add_token_to_namespace("entity3", "entity")
        self.entity_indexer = {"entity": TokenCharactersIndexerTokenizer(
            "entity", character_tokenizer=entity_tokenizer)
        }

        tokens1 = "The sentence .".split()
        tokens_field = TextField(
            [Token(t) for t in tokens1],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        self.instance1_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2"), Token("entity_unk")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(0, 0, tokens_field),
                     SpanField(1, 2, tokens_field)]
            )
        }

        tokens2 = "The sentence".split()
        tokens2_field = TextField(
            [Token(t) for t in tokens2], 
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        self.instance2_fields = {
            "candidate_entities": TextField(
                    [Token("entity1")], 
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[1.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens2_field)],
            )
        }
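A closing observation that ties the fixtures above to the batching exercised in Example #3 (stated here as a hedged note, not project code): when two instances whose candidate_spans ListFields have different lengths are batched together, the shorter list is padded with the result of empty_field(), so the padded rows come out as (-1, -1) pairs, the sentinel tested in Examples #2 and #12, which downstream code can mask out. A tiny sketch, reusing tokens2_field from the setUp above:

padding_span = SpanField(1, 1, tokens2_field).empty_field()
assert (padding_span.span_start, padding_span.span_end) == (-1, -1)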