Python allennlp.data.fields.SpanField() Examples
The following are 22 code examples of allennlp.data.fields.SpanField(). Each example notes its original project and source file above the code. You may also want to check out all other available functions/classes of the module allennlp.data.fields.
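Before the project examples, here is a minimal sketch of the basic SpanField pattern, assuming a recent allennlp release; the variable names are illustrative, not taken from any of the projects below. A SpanField stores an inclusive (start, end) pair of token indices into a sequence field and serializes to a tensor of shape (2,):

from allennlp.data.fields import SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# The sequence the span indexes into.
text = TextField([Token(t) for t in "This is a sentence .".split()],
                 {"tokens": SingleIdTokenIndexer()})

# Both endpoints are inclusive token indices; this span covers "a sentence".
span = SpanField(2, 3, text)

# SpanField needs no padding of its own and becomes a LongTensor([start, end]).
tensor = span.as_tensor(span.get_padding_lengths())  # tensor([2, 3])

Construction validates the span, which is what several of the tests below exercise: the indices must be integers, the start must not exceed the end, and the end must fall inside the sequence.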
Example #1
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = (['__start__'] + tokens[entity_1[0]:entity_1[1] + 1] + ['__del1__']
              + tokens[entity_2[0]:entity_2[1] + 1] + ['__del2__'] + tokens + ['__clf__'])

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #2
Source File: span_field_test.py From magnitude with MIT License
def test_empty_span_field_works(self):
    span_field = SpanField(1, 3, self.text)
    empty_span = span_field.empty_field()
    assert empty_span.span_start == -1
    assert empty_span.span_end == -1
Example #3
Source File: test_dict_field.py From kb with Apache License 2.0
def test_list_field_of_dict_field(self):
    from allennlp.data import Instance
    from allennlp.data.iterators import BasicIterator

    tokens3 = "The long sentence .".split()
    tokens3_field = TextField(
        [Token(t) for t in tokens3],
        token_indexers={'tokens': SingleIdTokenIndexer()}
    )

    instance3_fields = {
        "candidate_entities": TextField(
            [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np.array([[0.1, 0.1, 0.8],
                                                       [1.0, 0.0, 0.0],
                                                       [0.33, 0.67, 0.0]])),
        "candidate_spans": ListField(
            [SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field)],
        )
    }

    iterator = BasicIterator()
    iterator.index_with(self.vocab)

    instances = [Instance({"candidates": ListField([
                     DictField(self.instance1_fields),
                     DictField(self.instance2_fields)])}),
                 Instance({"candidates": ListField([
                     DictField(self.instance1_fields),
                     DictField(instance3_fields)])})]

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    self.assertTrue(
        batch['candidates']['candidate_entities']['entity'].shape ==
        batch['candidates']['candidate_entity_prior'].shape
    )
Example #4
Source File: paired_span_pred_reader.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = example['paired_c_tokens']
    assert len(joint_tokens_seq) <= 512

    segments_ids = example['segment_ids']

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    # text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))
    # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'],
                                  fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
Example #5
Source File: span_pred_reader.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + \
        example['context_c_tokens'] + ['[SEP]']
    assert len(joint_tokens_seq) < 512

    text1_len = len(example['query_c_tokens']) + 2
    text2_len = len(example['context_c_tokens']) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'],
                                  fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
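The index arithmetic in this reader is easy to misread, so here is a worked illustration with assumed toy values (not from the actual dataset):

# joint_tokens_seq = ['[CLS]', 'a', 'b', '[SEP]', 'c', '[SEP]']
#   example['query_c_tokens']   = ['a', 'b'] -> text1_len = 4  ([CLS] + tokens + [SEP])
#   example['context_c_tokens'] = ['c']      -> text2_len = 2  (tokens + [SEP])
# segments_ids = [0, 0, 0, 0, 1, 1]
# text1_span = (1, 3)   # tokens 'a', 'b'; end exclusive
# text2_span = (4, 5)   # token 'c'; end exclusive

Note the mixed conventions the reader juggles: text1_span and text2_span carry exclusive ends, while SpanField's own convention in allennlp (and the reader's gt_span) treats both endpoints as inclusive, which is why the inline comments flag the distinction so heavily.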
Example #6
Source File: bert_fever_reader.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     sent1: str,  # Important type information
                     sent2: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
    tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.s1_l]
    tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

    joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
    text1_len = len(tokenized_text1) + 2
    text2_len = len(tokenized_text2) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
Example #7
Source File: span_field_test.py From magnitude with MIT License
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
    with pytest.raises(ValueError):
        _ = SpanField(1, 30, self.text)
Example #8
Source File: span_field_test.py From magnitude with MIT License
def test_span_field_raises_on_ill_defined_span(self):
    with pytest.raises(ValueError):
        _ = SpanField(4, 1, self.text)
Example #9
Source File: span_field_test.py From magnitude with MIT License
def test_span_field_raises_on_incorrect_label_type(self):
    with pytest.raises(TypeError):
        _ = SpanField(u"hello", 3, self.text)
Example #10
Source File: span_field_test.py From magnitude with MIT License
def test_as_tensor_converts_span_field_correctly(self):
    span_field = SpanField(2, 3, self.text)
    tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_equal(tensor, numpy.array([2, 3]))
Example #11
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_equality(self):
    span_field1 = SpanField(2, 3, self.text)
    span_field2 = SpanField(2, 3, self.text)
    span_field3 = SpanField(
        2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
    )

    assert span_field1 == (2, 3)
    assert span_field1 == span_field1
    assert span_field1 == span_field2
    assert span_field1 != span_field3
    assert span_field2 != span_field3
Example #12
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_empty_span_field_works(self):
    span_field = SpanField(1, 3, self.text)
    empty_span = span_field.empty_field()
    assert empty_span.span_start == -1
    assert empty_span.span_end == -1
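The (-1, -1) sentinel produced by empty_field() is what ListField uses to pad shorter span lists in a batch, so models typically mask out spans with a negative start index. A minimal sketch of that interaction, assuming a recent allennlp release (the names are illustrative, not from the test suite):

from allennlp.data.fields import ListField, SpanField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

text = TextField([Token(t) for t in "a b c d".split()],
                 {"tokens": SingleIdTokenIndexer()})
spans = ListField([SpanField(0, 1, text), SpanField(2, 3, text)])

# Padding this list to length 3 appends an empty span, which
# serializes as [-1, -1] in the resulting (3, 2) tensor.
padded = spans.as_tensor({"num_fields": 3})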
Example #13
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_span_field_raises_if_span_end_is_greater_than_sentence_length(self):
    with pytest.raises(ValueError):
        _ = SpanField(1, 30, self.text)
Example #14
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_span_field_raises_on_ill_defined_span(self):
    with pytest.raises(ValueError):
        _ = SpanField(4, 1, self.text)
Example #15
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_span_field_raises_on_incorrect_label_type(self):
    with pytest.raises(TypeError):
        _ = SpanField("hello", 3, self.text)
Example #16
Source File: span_field_test.py From allennlp with Apache License 2.0
def test_as_tensor_converts_span_field_correctly(self):
    span_field = SpanField(2, 3, self.text)
    tensor = span_field.as_tensor(span_field.get_padding_lengths()).detach().cpu().numpy()
    numpy.testing.assert_array_equal(tensor, numpy.array([2, 3]))
Example #17
Source File: bert_reader_context_selection.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     query: str,  # Important type information
                     context: str,
                     fid: str = None,
                     qid: str = None,
                     selection_label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(query)
    tokenized_text2 = self.bert_tokenizer.tokenize(context)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.query_l]
    tokenized_text2 = tokenized_text2[:self.context_l]

    s1_tokens_seq = ['[CLS]'] + tokenized_text1
    s2_tokens_seq = ['[CLS]'] + tokenized_text2

    # text1_len = len(tokenized_text1) + 1
    # text2_len = len(tokenized_text2) + 1
    # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
    s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

    fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
    fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

    text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (1, len(tokenized_text2))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

    if selection_label:
        fields['label'] = LabelField(selection_label, label_namespace='labels')

    assert fid is not None
    assert qid is not None

    fields['fid'] = IdField(fid)
    fields['qid'] = IdField(qid)

    return Instance(fields)
Example #18
Source File: bert_fever_verification_separate_seq.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     s1: str,  # Important type information
                     s2: str,
                     pid: str,
                     selection_label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(s1)
    tokenized_text2 = self.bert_tokenizer.tokenize(s2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.s1_l]
    tokenized_text2 = tokenized_text2[:self.s2_l]

    s1_tokens_seq = ['[CLS]'] + tokenized_text1
    s2_tokens_seq = ['[CLS]'] + tokenized_text2

    # text1_len = len(tokenized_text1) + 1
    # text2_len = len(tokenized_text2) + 1
    # segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    s1_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s1_tokens_seq)
    s2_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(s2_tokens_seq)

    fields['s1_sequence'] = BertIndexField(np.asarray(s1_tokens_ids, dtype=np.int64))
    fields['s2_sequence'] = BertIndexField(np.asarray(s2_tokens_ids, dtype=np.int64))

    text1_span = (1, len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (1, len(tokenized_text2))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['s1_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['s2_sequence'])

    if selection_label:
        fields['label'] = LabelField(selection_label, label_namespace='labels')

    # assert fid is not None
    assert pid is not None

    # fields['fid'] = IdField(fid)
    fields['pid'] = IdField(pid)

    return Instance(fields)
Example #19
Source File: wiki_linking_reader.py From kb with Apache License 2.0
def text_to_instance(self,
                     tokenized_text: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     doc_id: str = None):
    assert doc_id is not None

    token_field = TextField([Token(x) for x in tokenized_text], self.token_indexers)
    span_fields = ListField([SpanField(*span, token_field) for span in candidate_spans])

    candidate_entities = TextField(
        [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
        token_indexers=self.entity_indexer)

    max_cands = max(len(p) for p in candidate_entity_prior)
    for p in candidate_entity_prior:
        if len(p) < max_cands:
            p.extend([0.0] * (max_cands - len(p)))
    np_prior = np.array(candidate_entity_prior)
    prior_field = ArrayField(np_prior)

    # only one segment
    candidate_segment_ids = ArrayField(
        np.array([0] * len(candidate_entities)), dtype=np.int
    )

    fields = {
        "tokens": token_field,
        "candidate_spans": span_fields,
        "candidate_entities": candidate_entities,
        "candidate_entity_prior": prior_field,
        "candidate_segment_ids": candidate_segment_ids
    }

    if gold_entities:
        labels = TextField([Token(entity) for entity in gold_entities],
                           token_indexers=self.entity_indexer)
        fields["gold_entities"] = labels

    fields["doc_id"] = MetadataField(doc_id)

    if self.extra_candidate_generators:
        tokens = " ".join(tokenized_text)
        extra_candidates = {
            key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)
Example #20
Source File: wordnet.py From kb with Apache License 2.0
def text_to_instance(self,
                     tokens: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     gold_data_ids: List[str] = None):
    # prior needs to be 2D and full
    # can look like [[0.2, 0.8], [1.0]] if one candidate for second
    # candidate span and two candidates for first
    max_cands = max(len(p) for p in candidate_entity_prior)
    for p in candidate_entity_prior:
        if len(p) < max_cands:
            p.extend([0.0] * (max_cands - len(p)))
    np_prior = np.array(candidate_entity_prior)

    fields = {
        "tokens": TextField([Token(t) for t in tokens],
                            token_indexers=self.token_indexers),
        # join by space, then retokenize in the "character indexer"
        "candidate_entities": TextField(
            [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np.array(np_prior)),
        # only one sentence
        "candidate_segment_ids": ArrayField(
            np.array([0] * len(candidate_entities)), dtype=np.int
        )
    }

    if gold_entities is not None:
        fields["gold_entities"] = TextField([Token(entity) for entity in gold_entities],
                                            token_indexers=self.entity_indexer)
    if gold_data_ids is not None:
        fields["gold_data_ids"] = MetadataField(gold_data_ids)

    span_fields = []
    for span in candidate_spans:
        span_fields.append(SpanField(span[0], span[1], fields['tokens']))
    fields['candidate_spans'] = ListField(span_fields)

    if self.extra_candidate_generators:
        tokens = " ".join(tokens)
        extra_candidates = {
            key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)
Example #21
Source File: bert_tokenizer_and_candidate_generator.py From kb with Apache License 2.0
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
    """
    tokens_and_candidates is the return from a previous call to
    generate_sentence_entity_candidates.  Converts the dict to
    a dict of fields usable with allennlp.
    """
    fields = {}

    fields['tokens'] = TextField(
        [Token(t, text_id=self.bert_tokenizer.vocab[t])
         for t in tokens_and_candidates['tokens']],
        token_indexers=self._bert_single_id_indexer
    )

    fields['segment_ids'] = ArrayField(
        np.array(tokens_and_candidates['segment_ids']), dtype=np.int
    )

    all_candidates = {}
    for key, entity_candidates in tokens_and_candidates['candidates'].items():
        # pad the prior to create the array field
        # make a copy to avoid modifying the input
        candidate_entity_prior = copy.deepcopy(
            entity_candidates['candidate_entity_priors']
        )
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        candidate_fields = {
            "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list))
                 for candidate_list in entity_candidates["candidate_entities"]],
                token_indexers={'ids': self._entity_indexers[key]}),
            "candidate_spans": ListField(
                [SpanField(span[0], span[1], fields['tokens'])
                 for span in entity_candidates['candidate_spans']]
            ),
            "candidate_segment_ids": ArrayField(
                np.array(entity_candidates['candidate_segment_ids']), dtype=np.int
            )
        }
        all_candidates[key] = DictField(candidate_fields)

    fields["candidates"] = DictField(all_candidates)

    return fields
Example #22
Source File: test_dict_field.py From kb with Apache License 2.0
def setUp(self):
    super(TestDictField, self).setUp()

    entity_tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("entity1", "entity")
    self.vocab.add_token_to_namespace("entity2", "entity")
    self.vocab.add_token_to_namespace("entity3", "entity")

    self.entity_indexer = {"entity": TokenCharactersIndexerTokenizer(
        "entity", character_tokenizer=entity_tokenizer)
    }

    tokens1 = "The sentence .".split()
    tokens_field = TextField(
        [Token(t) for t in tokens1],
        token_indexers={'tokens': SingleIdTokenIndexer()}
    )

    self.instance1_fields = {
        "candidate_entities": TextField(
            [Token("entity1 entity2"), Token("entity_unk")],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np.array([[0.5, 0.5], [1.0, 0.0]])),
        "candidate_spans": ListField(
            [SpanField(0, 0, tokens_field), SpanField(1, 2, tokens_field)]
        )
    }

    tokens2 = "The sentence".split()
    tokens2_field = TextField(
        [Token(t) for t in tokens2],
        token_indexers={'tokens': SingleIdTokenIndexer()}
    )

    self.instance2_fields = {
        "candidate_entities": TextField(
            [Token("entity1")],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np.array([[1.0]])),
        "candidate_spans": ListField(
            [SpanField(1, 1, tokens2_field)],
        )
    }