Python allennlp.data.fields.ArrayField() Examples
The following are 29 code examples of allennlp.data.fields.ArrayField().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
Example #1
Source File: vampire_reader.py From vampire with Apache License 2.0 | 6 votes |
def text_to_instance(self, vec: str = None) -> Instance:  # type: ignore
    """
    Wrap a pre-computed vector in an ``Instance``.

    Parameters
    ----------
    vec : optional (default = None)
        The value passed unchanged to ``ArrayField``.
        NOTE(review): the parameter is annotated ``str``, yet it is handed
        straight to ``ArrayField`` -- presumably it is really a numeric
        array; confirm against the caller before tightening the annotation.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``ArrayField``
            The field built from ``vec``.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    fields['tokens'] = ArrayField(vec)
    return Instance(fields)
Example #2
Source File: array_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_padding_handles_list_fields(self):
    """ArrayFields of different shapes inside a ListField are zero-padded to a common shape."""
    ones_2x3 = ArrayField(numpy.ones([2, 3]))
    ones_1x5 = ArrayField(numpy.ones([1, 5]))
    blank = ones_2x3.empty_field()
    list_field = ListField([ones_2x3, ones_1x5, blank])

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    # Each member is expected to be padded to 2 x 5; the empty field is all padding.
    expected = numpy.array(
        [
            [[1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0]],
            [[1.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0]],
            [[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]],
        ]
    )
    numpy.testing.assert_array_equal(actual, expected)
Example #3
Source File: array_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_padding_handles_list_fields_with_padding_values(self):
    """A custom ``padding_value`` of -1 is used for every padded cell, including the empty field."""
    ones_2x3 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    ones_1x5 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    blank = ones_2x3.empty_field()
    list_field = ListField([ones_2x3, ones_1x5, blank])

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    # Same layout as the zero-padded case, with -1 wherever padding was inserted.
    expected = numpy.array(
        [
            [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
            [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
            [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
        ]
    )
    numpy.testing.assert_array_equal(actual, expected)
Example #4
Source File: array_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_alternative_dtypes(self):
    """The dtype given to ArrayField is kept by as_tensor, by padding and by empty_field()."""
    dims = [3, 4, 5, 6]
    zeros = numpy.zeros(dims)

    # Setting dtype to numpy.int64 should produce a torch.LongTensor.
    long_field = ArrayField(zeros, dtype=numpy.int64)
    long_tensor = long_field.as_tensor(long_field.get_padding_lengths())
    assert long_tensor.dtype == torch.int64

    # Setting dtype to numpy.uint8 should produce a torch.ByteTensor.
    byte_field = ArrayField(zeros, dtype=numpy.uint8)
    byte_tensor = byte_field.as_tensor(byte_field.get_padding_lengths())
    assert byte_tensor.dtype == torch.uint8

    # Padding should not affect dtype.
    oversized = {"dimension_" + str(axis): 10 for axis in range(len(dims))}
    padded = byte_field.as_tensor(oversized)
    assert padded.dtype == torch.uint8

    # Empty fields should have the same dtype.
    blank = byte_field.empty_field()
    assert blank.dtype == byte_field.dtype
Example #5
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_padding_handles_list_fields(self):
    """Differently shaped ArrayFields in a ListField come back zero-padded to one shape."""
    first = ArrayField(numpy.ones([2, 3]))
    second = ArrayField(numpy.ones([1, 5]))
    blank = first.empty_field()
    combined = ListField([first, second, blank])

    lengths = combined.get_padding_lengths()
    actual = combined.as_tensor(lengths).detach().cpu().numpy()

    # One 2 x 5 slab per member: data in the top-left corner, zeros elsewhere.
    expected = numpy.array([
        [[1., 1., 1., 0., 0.], [1., 1., 1., 0., 0.]],
        [[1., 1., 1., 1., 1.], [0., 0., 0., 0., 0.]],
        [[0., 0., 0., 0., 0.], [0., 0., 0., 0., 0.]],
    ])
    numpy.testing.assert_array_equal(actual, expected)
Example #6
Source File: test_dict_field.py From kb with Apache License 2.0 | 5 votes |
def test_list_field_of_dict_field(self):
    """Batching ListFields of DictFields yields entity and prior tensors of the same shape."""
    from allennlp.data import Instance
    from allennlp.data.iterators import BasicIterator

    words = "The long sentence .".split()
    sentence_field = TextField(
        [Token(word) for word in words],
        token_indexers={'tokens': SingleIdTokenIndexer()}
    )

    # A third candidate set with three spans, each carrying three priors.
    entity_field = TextField(
        [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
        token_indexers=self.entity_indexer)
    prior_field = ArrayField(np.array([[0.1, 0.1, 0.8],
                                       [1.0, 0.0, 0.0],
                                       [0.33, 0.67, 0.0]]))
    span_field = ListField(
        [SpanField(1, 1, sentence_field),
         SpanField(1, 2, sentence_field),
         SpanField(1, 3, sentence_field)],
    )
    instance3_fields = {
        "candidate_entities": entity_field,
        "candidate_entity_prior": prior_field,
        "candidate_spans": span_field,
    }

    iterator = BasicIterator()
    iterator.index_with(self.vocab)

    first_instance = Instance({"candidates": ListField([
        DictField(self.instance1_fields),
        DictField(self.instance2_fields)])})
    second_instance = Instance({"candidates": ListField([
        DictField(self.instance1_fields),
        DictField(instance3_fields)])})
    instances = [first_instance, second_instance]

    # Exhaust the iterator; only the last batch produced is inspected below.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    candidates = batch['candidates']
    entity_shape = candidates['candidate_entities']['entity'].shape
    prior_shape = candidates['candidate_entity_prior'].shape
    self.assertTrue(entity_shape == prior_shape)
Example #7
Source File: ultra_fine_reader.py From kb with Apache License 2.0 | 5 votes |
def text_to_instance(self, sentence, span, labels, index_entity_start):
    """
    Build an ``Instance`` from a sentence, a span and its labels.

    Parameters
    ----------
    sentence, span
        Forwarded to ``tokenize_and_generate_candidates`` of the configured
        tokenizer/candidate generator.
    labels
        Converted with ``np.array`` and stored as the integer ``label_ids``
        ``ArrayField``.
    index_entity_start
        Optional index into the shifted offsets list; when not ``None`` an
        ``index_a`` ``LabelField`` is added.

    Returns
    -------
    An ``Instance`` holding the generator's fields plus ``label_ids`` and,
    optionally, ``index_a``.
    """
    generator = self.tokenizer_and_candidate_generator
    token_candidates = generator.tokenize_and_generate_candidates(sentence, span)
    fields = generator.convert_tokens_candidates_to_fields(token_candidates)

    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in NumPy >= 1.24; the builtin gives the same dtype on every version.
    fields['label_ids'] = ArrayField(np.array(labels), dtype=int)

    # index of entity start
    if index_entity_start is not None:
        # Shift the offsets right by one, with 1 as the first entry.
        offsets = [1] + token_candidates['offsets_a'][:-1]
        idx1_offset = offsets[index_entity_start]
        fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)

    return Instance(fields)
Example #8
Source File: single_correct_mcq_entailment.py From multee with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """
    Build an ``Instance`` for a multiple-choice question with one correct answer.

    Parameters
    ----------
    premises : ``List[str]``
        Premise sentences; each is tokenized and sliced with
        ``[-self._premise_max_tokens:]``.
    hypotheses : ``List[str]``
        Candidate hypotheses; each is tokenized and sliced with
        ``[-self._hypothesis_max_tokens:]``.
    answer_index : ``int``, optional (default = None)
        Index of the correct hypothesis.
    relevant_sentence_idxs : ``List[int]``, optional (default = None)
        Indices of the premises to mark with 1 in ``relevance_presence_mask``.

    Returns
    -------
    An ``Instance`` with ``premises``, ``hypotheses`` and ``paragraph`` fields,
    plus ``relevance_presence_mask`` and ``answer_index`` when provided.

    Raises
    ------
    ConfigurationError
        If ``answer_index`` is not a valid index into ``hypotheses``.
    """
    fields = {}
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]

    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        # No premises: derive an empty ListField from a one-token stub.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    fields['hypotheses'] = ListField(hypotheses_text_fields)

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

    # If entailment labels are available
    if answer_index is not None:
        if answer_index not in range(0, len(hypotheses)):
            raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
        # ``np.long`` was removed in NumPy 1.24 and was previously just the
        # builtin ``int``, so the builtin is the portable equivalent.
        fields['answer_index'] = ArrayField(np.array(answer_index),
                                            padding_value=-1,
                                            dtype=int)

    # All premise tokens flattened, in order, into a single paragraph field.
    paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
    fields['paragraph'] = TextField(paragraph_tokens, self._token_indexers)
    return Instance(fields)
Example #9
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_printing_doesnt_crash(self):
    """Smoke test: printing an ArrayField must not raise."""
    field = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    print(field)
Example #10
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_padding_handles_list_fields_with_padding_values(self):
    """With ``padding_value=-1`` every padded cell, and the whole empty field, is -1."""
    first = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    second = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    blank = first.empty_field()
    combined = ListField([first, second, blank])

    lengths = combined.get_padding_lengths()
    actual = combined.as_tensor(lengths).detach().cpu().numpy()

    # One 2 x 5 slab per member: data in the top-left corner, -1 elsewhere.
    expected = numpy.array([
        [[1., 1., 1., -1., -1.], [1., 1., 1., -1., -1.]],
        [[1., 1., 1., 1., 1.], [-1., -1., -1., -1., -1.]],
        [[-1., -1., -1., -1., -1.], [-1., -1., -1., -1., -1.]],
    ])
    numpy.testing.assert_array_equal(actual, expected)
Example #11
Source File: dataset_reader.py From ConvLab with MIT License | 5 votes |
def text_to_instance(self, state: np.ndarray, action: int = None) -> Instance:  # type: ignore
    """
    Build an ``Instance`` from a state array and, optionally, an action.

    Parameters
    ----------
    state : ``np.ndarray``
        Stored as the ``states`` ``ArrayField``.
    action : ``int``, optional (default = None)
        When not ``None``, stored as the ``actions`` ``LabelField`` with
        ``skip_indexing=True``.

    Returns
    -------
    An ``Instance`` with ``states`` and, if an action was given, ``actions``.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {"states": ArrayField(state)}
    if action is not None:
        fields["actions"] = LabelField(action, skip_indexing=True)
    return Instance(fields)
Example #12
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_as_tensor_handles_larger_padding_dimensions(self):
    """Requesting more padding than the array's shape keeps the data and zero-fills the rest."""
    dims = [3, 4]
    ones = numpy.ones(dims)
    requested = {u"dimension_0": 5, u"dimension_1": 6}

    padded = ArrayField(ones).as_tensor(requested).detach().cpu().numpy()

    # The original block is untouched; the region beyond both dimensions is zero.
    numpy.testing.assert_array_equal(padded[:3, :4], ones)
    numpy.testing.assert_array_equal(padded[3:, 4:], 0.)
Example #13
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_get_padding_lengths_correctly_returns_ordered_shape(self):
    """``get_padding_lengths`` reports ``dimension_<i>`` equal to the array's i-th axis size."""
    dims = [3, 4, 5, 6]
    lengths = ArrayField(numpy.zeros(dims)).get_padding_lengths()

    # One check per reported entry, addressed by position.
    for axis, _ in enumerate(lengths):
        key = u"dimension_{}".format(axis)
        assert lengths[key] == dims[axis]
Example #14
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_eq(self):
    """ArrayFields compare equal only when both shape and values match."""
    base = ArrayField(numpy.asarray([1, 1, 1]))
    different_shape = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]]))
    different_values = ArrayField(numpy.asarray([1, 1, 2]))
    identical = ArrayField(numpy.asarray([1, 1, 1]))

    assert base != different_shape
    assert base != different_values
    assert base == identical
Example #15
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_len_works_with_scalar(self):
    """``len()`` of an ArrayField wrapping a 0-d array is 1."""
    scalar_field = ArrayField(numpy.asarray(42))
    assert len(scalar_field) == 1
Example #16
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_with_scalar_keeps_dtype(self):
    """A float32 scalar array converts to a float32 tensor."""
    scalar_field = ArrayField(numpy.asarray(42, dtype=numpy.float32))
    lengths = scalar_field.get_padding_lengths()
    tensor = scalar_field.as_tensor(lengths)
    assert tensor.dtype == torch.float32
Example #17
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_printing_doesnt_crash(self):
    """Smoke test: ``print`` on an ArrayField with a custom padding value must not raise."""
    ones_field = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    print(ones_field)
Example #18
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_handles_larger_padding_dimensions(self):
    """Padding lengths larger than the array keep the data intact and zero-fill the excess."""
    dims = [3, 4]
    ones = numpy.ones(dims)
    field = ArrayField(ones)

    requested = {"dimension_0": 5, "dimension_1": 6}
    padded = field.as_tensor(requested).detach().cpu().numpy()

    # Original data sits in the top-left corner; the bottom-right excess is zero.
    numpy.testing.assert_array_equal(padded[:3, :4], ones)
    numpy.testing.assert_array_equal(padded[3:, 4:], 0.0)
Example #19
Source File: array_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_get_padding_lengths_correctly_returns_ordered_shape(self):
    """Each ``dimension_<i>`` padding length equals the size of axis ``i``."""
    dims = [3, 4, 5, 6]
    field = ArrayField(numpy.zeros(dims))
    lengths = field.get_padding_lengths()

    # Walk the reported entries by position and compare with the source shape.
    for axis, _ in enumerate(lengths):
        assert lengths["dimension_{}".format(axis)] == dims[axis]
Example #20
Source File: arc_multichoice_with_facts_text_json_reader_multi_source.py From OpenBookQA with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     facts_text_list: List[str],
                     question2facts_mapping: List[float],
                     choice2facts_mapping: List[List[float]],
                     answer_id: int,
                     meta_fields: Dict = None) -> Instance:
    """
    Build an ``Instance`` for a multiple-choice question with supporting facts.

    The question, each choice and each fact are tokenized with
    ``self.tokenize`` and wrapped in text fields; the two mappings become
    ``ArrayField``s; ``answer_id`` becomes the ``label`` field. The raw
    texts, token strings and gold label are stored in a ``metadata`` field,
    extended with the entries of ``meta_fields`` when it is given.
    """
    # pylint: disable=arguments-differ
    question_tokens = self.tokenize(question_text, "question")
    choices_tokens_list = [self.tokenize(choice, "choice") for choice in choice_text_list]
    facts_tokens_list = [self.tokenize(fact, "fact") for fact in facts_text_list]

    choice_fields = [TextField(tokens, self._token_indexers) for tokens in choices_tokens_list]
    fact_fields = [TextField(tokens, self._token_indexers) for tokens in facts_tokens_list]

    fields: Dict[str, Field] = {}
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField(choice_fields)
    fields['facts_list'] = ListField(fact_fields)
    fields['question2facts_map'] = ArrayField(np.asarray(question2facts_mapping))
    fields['choice2facts_map'] = ArrayField(np.asarray(choice2facts_mapping))
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "facts_text_list": facts_text_list,
        "question_tokens": [token.text for token in question_tokens],
        "choice_tokens_list": [[token.text for token in tokens] for tokens in choices_tokens_list],
        "facts_tokens_list": [[token.text for token in tokens] for tokens in facts_tokens_list],
        "label_gold": answer_id,
    }

    # Caller-supplied entries are merged last, so they override the defaults above.
    if meta_fields is not None:
        for key, value in meta_fields.items():
            metadata[key] = value

    fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #21
Source File: summarization_reader.py From summarus with Apache License 2.0 | 4 votes |
def text_to_instance(self, source: str, target: str = None) -> Instance:
    """
    Build an ``Instance`` from a source text and, optionally, a target text.

    The source (and target, when given) is optionally lower-cased, tokenized,
    truncated and wrapped in start/end symbols, then stored as a ``TextField``.
    Depending on ``self._save_copy_fields`` and ``self._save_pgn_fields``,
    extra fields are added: ``source_to_target``, ``source_token_ids``,
    ``target_token_ids`` and ``metadata``.

    NOTE(review): if both flags are set, the pgn branch runs after (before,
    inside ``if target``) the copy branch and the later writes overwrite the
    earlier ones for the same keys -- confirm whether both flags may be set.
    """
    def prepare_text(text, max_tokens):
        # Lower-case if configured, truncate to max_tokens, then add start/end symbols.
        text = text.lower() if self._lowercase else text
        tokens = self._tokenizer.tokenize(text)[:max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        return tokens

    source_tokens = prepare_text(source, self._source_max_tokens)
    source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
    result = {'source_tokens': source_tokens_indexed}
    meta_fields = {}

    # Copy mode works on the source tokens without the start/end symbols.
    if self._save_copy_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

    # PGN mode keeps the start/end symbols.
    if self._save_pgn_fields:
        source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
        result["source_to_target"] = source_to_target_field
        meta_fields["source_tokens"] = [x.text for x in source_tokens]

    if target:
        target_tokens = prepare_text(target, self._target_max_tokens)
        target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
        result['target_tokens'] = target_tokens_indexed

        if self._save_pgn_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens]
            # Source and target are converted to ids in a single call, then split back.
            source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
            target_token_ids = source_and_target_token_ids[len(source_tokens):]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

        if self._save_copy_fields:
            meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
            # The source loses its two boundary symbols here, hence the "- 2" split point;
            # the target is passed with its boundary symbols.
            source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
            source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
            target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
            result["target_token_ids"] = ArrayField(np.array(target_token_ids))
    # No target: only the source ids are produced.
    elif self._save_copy_fields:
        source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))
    elif self._save_pgn_fields:
        source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
        result["source_token_ids"] = ArrayField(np.array(source_token_ids))

    if self._save_copy_fields or self._save_pgn_fields:
        result["metadata"] = MetadataField(meta_fields)
    return Instance(result)
Example #22
Source File: citation_data_reader_aclarc.py From scicite with Apache License 2.0 | 4 votes |
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     citing_paper_title: str = None,
                     cited_paper_title: str = None,
                     citing_paper_year: int = None,
                     cited_paper_year: int = None,
                     citing_author_ids: List[str] = None,
                     cited_author_ids: List[str] = None,
                     extended_context: str = None,
                     section_number: int = None,
                     section_title: str = None,
                     sents_before: List[str] = None,
                     sents_after: List[str] = None,
                     cite_marker_begin: int = None,
                     cite_marker_end: int = None,
                     cleaned_cite_text: str = None,
                     citation_excerpt_index: str = None,
                     citation_id: str = None,
                     venue: str = None) -> Instance:  # type: ignore
    """
    Build an ``Instance`` for one citation.

    Only ``citation_text``, ``intent``, the two paper years, the two paper
    ids, ``citation_excerpt_index`` and ``citation_id`` are read; the
    remaining parameters are accepted but unused in this method.

    Fields produced: ``citation_text``, optional ``lexicon_features`` (when
    ``self.use_sparse_lexicon_features`` is set), optional ``labels`` (when
    ``intent`` is not ``None``), ``year_diff`` and four metadata fields.
    """
    citation_tokens = self._tokenizer.tokenize(citation_text)
    # Tokenization of the paper titles and the extended context is disabled:
    # tok_cited_title = self._tokenizer.tokenize(cited_paper_title)
    # tok_citing_title = self._tokenizer.tokenize(citing_paper_title)
    # tok_extended_context = self._tokenizer.tokenize(extended_context)

    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }

    if self.use_sparse_lexicon_features:
        # convert to regular string
        lowered = [token.text.lower() for token in citation_tokens]
        lexicon_features, _ = is_in_lexicon(self.lexicons, lowered)
        feature_fields = [LabelField(feature, skip_indexing=True)
                          for feature in lexicon_features]
        fields["lexicon_features"] = ListField(feature_fields)

    if intent is not None:
        fields['labels'] = LabelField(intent)

    # The difference is only meaningful when both years are present and above -1;
    # otherwise -1 is stored as a placeholder.
    years_usable = (citing_paper_year and cited_paper_year and
                    citing_paper_year > -1 and cited_paper_year > -1)
    if years_usable:
        year_diff = citing_paper_year - cited_paper_year
    else:
        year_diff = -1
    fields['year_diff'] = ArrayField(torch.Tensor([year_diff]))

    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    fields['citation_excerpt_index'] = MetadataField(citation_excerpt_index)
    fields['citation_id'] = MetadataField(citation_id)
    return Instance(fields)
Example #23
Source File: citation_data_reader_scicite.py From scicite with Apache License 2.0 | 4 votes |
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     citing_paper_title: str = None,
                     cited_paper_title: str = None,
                     citing_paper_year: int = None,
                     cited_paper_year: int = None,
                     citing_author_ids: List[str] = None,
                     cited_author_ids: List[str] = None,
                     extended_context: str = None,
                     section_number: int = None,
                     section_title: str = None,
                     cite_marker_begin: int = None,
                     cite_marker_end: int = None,
                     sents_before: List[str] = None,
                     sents_after: List[str] = None,
                     cleaned_cite_text: str = None,
                     citation_excerpt_index: str = None,
                     venue: str = None) -> Instance:  # type: ignore
    """
    Build an ``Instance`` for one citation.

    Only ``citation_text``, ``intent``, the two paper years, the two paper
    ids and ``citation_excerpt_index`` are read; the remaining parameters
    are accepted but unused in this method.

    Fields produced: ``citation_text``, optional ``lexicon_features`` (when
    ``self.use_sparse_lexicon_features`` is set), optional ``labels`` (when
    ``intent`` is truthy), ``year_diff`` and four metadata fields, with
    ``citation_id`` composed as ``"<citing_paper_id>><cited_paper_id>"``.

    Raises
    ------
    TypeError
        If ``self.multilabel`` is false and ``intent`` is not a string.
    """
    citation_tokens = self._tokenizer.tokenize(citation_text)

    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }

    if self.use_sparse_lexicon_features:
        # convert to regular string
        sent = [token.text.lower() for token in citation_tokens]
        lexicon_features, _ = is_in_lexicon(self.lexicons, sent)
        fields["lexicon_features"] = ListField([LabelField(feature, skip_indexing=True)
                                                for feature in lexicon_features])

    if intent:
        if self.multilabel:
            fields['labels'] = MultiLabelField([S2_CATEGORIES[e] for e in intent],
                                               skip_indexing=True,
                                               num_labels=len(S2_CATEGORIES))
        else:
            if not isinstance(intent, str):
                # The original message embedded a second f-string prefix inside
                # the literal, which printed a stray "f" before the value.
                raise TypeError(f"Undefined label format. Should be a string. Got: '{intent}'")
            fields['labels'] = LabelField(intent)

    # The difference is only meaningful when both years are present and above -1;
    # otherwise -1 is stored as a placeholder.
    if citing_paper_year and cited_paper_year and \
            citing_paper_year > -1 and cited_paper_year > -1:
        year_diff = citing_paper_year - cited_paper_year
    else:
        year_diff = -1
    fields['year_diff'] = ArrayField(torch.Tensor([year_diff]))

    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    fields['citation_excerpt_index'] = MetadataField(citation_excerpt_index)
    fields['citation_id'] = MetadataField(f"{citing_paper_id}>{cited_paper_id}")
    return Instance(fields)
Example #24
Source File: bert_tokenizer_and_candidate_generator.py From kb with Apache License 2.0 | 4 votes |
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
    """
    tokens_and_candidates is the return from a previous call to
    generate_sentence_entity_candidates.  Converts the dict to
    a dict of fields usable with allennlp.

    The result has ``tokens``, ``segment_ids`` and ``candidates`` entries;
    ``candidates`` is a ``DictField`` with one nested ``DictField`` per key
    of ``tokens_and_candidates['candidates']``.

    Raises
    ------
    ValueError
        From ``max()`` if a candidate set has no priors at all.
    """
    fields = {}

    fields['tokens'] = TextField(
        [Token(t, text_id=self.bert_tokenizer.vocab[t])
         for t in tokens_and_candidates['tokens']],
        token_indexers=self._bert_single_id_indexer
    )

    # ``np.int`` was only an alias of the builtin ``int`` and no longer exists
    # in NumPy >= 1.24; the builtin gives the same dtype on every version.
    fields['segment_ids'] = ArrayField(
        np.array(tokens_and_candidates['segment_ids']), dtype=int
    )

    all_candidates = {}
    for key, entity_candidates in tokens_and_candidates['candidates'].items():
        # pad the prior to create the array field
        # make a copy to avoid modifying the input
        candidate_entity_prior = copy.deepcopy(
            entity_candidates['candidate_entity_priors']
        )
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        candidate_fields = {
            "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
            # All candidates of one span are joined by spaces into a single Token.
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list))
                 for candidate_list in entity_candidates["candidate_entities"]],
                token_indexers={'ids': self._entity_indexers[key]}),
            "candidate_spans": ListField(
                [SpanField(span[0], span[1], fields['tokens'])
                 for span in entity_candidates['candidate_spans']]
            ),
            "candidate_segment_ids": ArrayField(
                np.array(entity_candidates['candidate_segment_ids']), dtype=int
            )
        }
        all_candidates[key] = DictField(candidate_fields)

    fields["candidates"] = DictField(all_candidates)

    return fields
Example #25
Source File: wordnet.py From kb with Apache License 2.0 | 4 votes |
def text_to_instance(self,
                     tokens: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     gold_data_ids: List[str] = None):
    """
    Build an ``Instance`` from tokens and their entity candidates.

    Parameters
    ----------
    tokens : ``List[str]``
        Sentence tokens.
    candidate_entities : ``List[List[str]]``
        One list of candidate entities per candidate span.
    candidate_spans : ``List[List[int]]``
        One ``[start, end]`` pair per candidate span.
    candidate_entity_prior : ``List[List[float]]``
        One list of priors per candidate span; may be ragged. It is padded
        with 0.0 on a copy, so the caller's lists are left untouched.
    gold_entities, gold_data_ids : optional
        Added as ``gold_entities`` / ``gold_data_ids`` fields when given.

    Returns
    -------
    An ``Instance`` created with
    ``should_remap_span_indices=self.should_remap_span_indices``.

    Raises
    ------
    ValueError
        From ``max()`` if ``candidate_entity_prior`` is empty.
    """
    # prior needs to be 2D and full
    # can look like [[0.2, 0.8], [1.0]] if one candidate for second
    # candidate span and two candidates for first
    max_cands = max(len(p) for p in candidate_entity_prior)
    # Build padded copies instead of extending the caller's lists in place.
    padded_prior = [list(p) + [0.0] * (max_cands - len(p))
                    for p in candidate_entity_prior]
    np_prior = np.array(padded_prior)

    fields = {
        "tokens": TextField([Token(t) for t in tokens],
                            token_indexers=self.token_indexers),

        # join by space, then retokenize in the "character indexer"
        "candidate_entities": TextField(
            [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np_prior),
        # only one sentence
        # ``np.int`` no longer exists in NumPy >= 1.24; it was the builtin ``int``.
        "candidate_segment_ids": ArrayField(
            np.array([0] * len(candidate_entities)), dtype=int
        )
    }

    if gold_entities is not None:
        fields["gold_entities"] = TextField([Token(entity) for entity in gold_entities],
                                            token_indexers=self.entity_indexer)
    if gold_data_ids is not None:
        fields["gold_data_ids"] = MetadataField(gold_data_ids)

    fields['candidate_spans'] = ListField(
        [SpanField(span[0], span[1], fields['tokens']) for span in candidate_spans]
    )

    if self.extra_candidate_generators:
        # The extra generators receive the sentence as one whitespace-joined string.
        raw_text = " ".join(tokens)
        extra_candidates = {
            key: generator.get_mentions_raw_text(raw_text, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)
Example #26
Source File: kg_probe_reader.py From kb with Apache License 2.0 | 4 votes |
def text_to_instance(self, sentence: str, span: Tuple[int, ...]):
    """
    Build a masked-span ``Instance`` from a sentence.

    The word pieces covered by ``span`` are replaced with ``'[MASK]'`` and
    flagged in ``mask_indicator``; every entity candidate whose span touches
    the masked range is replaced by ``'@@MASK@@'`` with a prior of 1.0. The
    unmasked word pieces are kept as ``lm_label_ids``.
    """
    generator = self._tokenizer_and_candidate_generator
    token_candidates = generator.tokenize_and_generate_candidates(sentence)

    # NOTE: Skipping the padding here since sentences are all quite short.
    vocab = generator.bert_tokenizer.vocab
    label_tokens = [Token(piece, text_id=vocab[piece]) for piece in token_candidates['tokens']]
    lm_label_ids = TextField(label_tokens, token_indexers=self._label_indexer)

    # We need to offset the start and end of the span so that it aligns with word pieces.
    offsets = token_candidates['offsets_a']
    if span[0] == 0:
        start = 1  # Since 0'th elt. is <CLS>
    else:
        start = offsets[span[0] - 1]
    end = offsets[span[1]]

    masked_tokens: List[str] = token_candidates['tokens'].copy()
    mask_indicator = np.zeros(len(masked_tokens), dtype=np.uint8)
    for position in range(start, end):
        masked_tokens[position] = '[MASK]'
        mask_indicator[position] = 1

    token_candidates['tokens'] = masked_tokens

    # mask out the entity candidates
    last_masked = end - 1
    for candidate_set in token_candidates['candidates'].values():
        # (end-1) as candidate spans are exclusive (e.g. candidate_span = (0, 0) has start=0, end=1)
        hits = [
            k for k, candidate_span in enumerate(candidate_set['candidate_spans'])
            if start <= candidate_span[0] <= last_masked
            or start <= candidate_span[1] <= last_masked
        ]
        for k in hits:
            candidate_set['candidate_entities'][k] = ['@@MASK@@']
            candidate_set['candidate_entity_priors'][k] = [1.0]

    fields = generator.convert_tokens_candidates_to_fields(token_candidates)
    fields['lm_label_ids'] = lm_label_ids
    fields['mask_indicator'] = ArrayField(mask_indicator, dtype=np.uint8)

    return Instance(fields)
Example #27
Source File: wiki_linking_reader.py From kb with Apache License 2.0 | 4 votes |
def text_to_instance(self,
                     tokenized_text: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     doc_id: str = None):
    """
    Build an ``Instance`` from tokenized text and its entity candidates.

    Parameters
    ----------
    tokenized_text : ``List[str]``
        Document tokens.
    candidate_entities : ``List[List[str]]``
        One list of candidate entities per candidate span.
    candidate_spans : ``List[List[int]]``
        One span per candidate, unpacked into ``SpanField``.
    candidate_entity_prior : ``List[List[float]]``
        One list of priors per candidate span; may be ragged. It is padded
        with 0.0 on a copy, so the caller's lists are left untouched.
    gold_entities : ``List[str]``, optional
        Added as ``gold_entities`` when truthy.
    doc_id : ``str``
        Required despite the ``None`` default.

    Returns
    -------
    An ``Instance`` created with
    ``should_remap_span_indices=self.should_remap_span_indices``.

    Raises
    ------
    AssertionError
        If ``doc_id`` is ``None``.
    ValueError
        From ``max()`` if ``candidate_entity_prior`` is empty.
    """
    assert doc_id is not None

    token_field = TextField([Token(x) for x in tokenized_text],
                            self.token_indexers)
    span_fields = ListField([SpanField(*span, token_field) for span in candidate_spans])

    # Kept under its own name so the ``candidate_entities`` argument is not shadowed.
    candidate_entities_field = TextField(
        [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
        token_indexers=self.entity_indexer)

    max_cands = max(len(p) for p in candidate_entity_prior)
    # Build padded copies instead of extending the caller's lists in place.
    padded_prior = [list(p) + [0.0] * (max_cands - len(p))
                    for p in candidate_entity_prior]
    prior_field = ArrayField(np.array(padded_prior))

    # only one segment
    # ``np.int`` no longer exists in NumPy >= 1.24; it was the builtin ``int``.
    candidate_segment_ids = ArrayField(
        np.array([0] * len(candidate_entities)), dtype=int
    )

    fields = {
        "tokens": token_field,
        "candidate_spans": span_fields,
        "candidate_entities": candidate_entities_field,
        "candidate_entity_prior": prior_field,
        "candidate_segment_ids": candidate_segment_ids
    }
    if gold_entities:
        labels = TextField([Token(entity) for entity in gold_entities],
                           token_indexers=self.entity_indexer)
        fields["gold_entities"] = labels

    fields["doc_id"] = MetadataField(doc_id)

    if self.extra_candidate_generators:
        # The extra generators receive the text as one whitespace-joined string.
        tokens = " ".join(tokenized_text)
        extra_candidates = {
            key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)
Example #28
Source File: multiple_correct_mcq_entailment.py From multee with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_indices: List[int] = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """
    Build an ``Instance`` for a multiple-choice question with several correct answers.

    Parameters
    ----------
    premises : ``List[str]``
        Premise sentences; each is tokenized and sliced with
        ``[-self._premise_max_tokens:]``.
    hypotheses : ``List[str]``
        Candidate hypotheses; each is tokenized and sliced with
        ``[-self._hypothesis_max_tokens:]``.
    answer_indices : ``List[int]``, optional (default = None)
        Indices of the hypotheses to mark with 1 in ``answer_correctness_mask``.
    relevant_sentence_idxs : ``List[int]``, optional (default = None)
        Indices of the premises to mark with 1 in ``relevance_presence_mask``.

    Returns
    -------
    An ``Instance`` with ``premises``, ``hypotheses`` and ``paragraph`` fields,
    plus the two masks when the corresponding indices are provided.
    """
    fields = {}
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]

    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        # No premises: derive an empty ListField from a one-token stub.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    fields['hypotheses'] = ListField(hypotheses_text_fields)

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

    # If answer_indices labels are available
    if answer_indices is not None:
        answer_correctness_mask = np.zeros(len(hypotheses))
        for answer_index in answer_indices:
            answer_correctness_mask[answer_index] = 1
        # ``np.long`` was removed in NumPy 1.24 and was previously just the
        # builtin ``int``, so the builtin is the portable equivalent.
        fields['answer_correctness_mask'] = ArrayField(answer_correctness_mask,
                                                       padding_value=-1,
                                                       dtype=int)

    # All premise tokens flattened, in order, into a single paragraph field.
    paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
    fields['paragraph'] = TextField(paragraph_tokens, self._token_indexers)
    return Instance(fields)
Example #29
Source File: fever_reader_with_wn.py From combine-FEVER-NSMN with MIT License | 4 votes |
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypothesis: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    """
    Build an ``Instance`` for a premise/hypothesis pair with WordNet features.

    Both texts are split on single spaces and, when ``self.max_l`` is set,
    truncated to that many tokens. The same truncated word lists are used to
    compute the WordNet feature arrays, whose first dimension must match the
    token counts. ``label`` and ``pid`` fields are added only when truthy.
    """
    # Split once; the words feed both the token fields and the WordNet features.
    premise_s = premise.split(' ')
    hypothesis_s = hypothesis.split(' ')
    if self.max_l is not None:
        premise_s = premise_s[:self.max_l]
        hypothesis_s = hypothesis_s[:self.max_l]

    premise_tokens = [Token(word) for word in premise_s]
    hypothesis_tokens = [Token(word) for word in hypothesis_s]

    fields: Dict[str, Field] = {}
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

    # WN feature dict:
    example_feature = wn_persistent_api.compute_wn_features_p_accerate(
        premise_s, hypothesis_s, self.wn_p_dict)
    p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
        example_feature, self.wn_feature_list)

    # One feature row per token is required on both sides.
    assert len(premise_tokens) == p_wn_nparray.shape[0]
    assert len(hypothesis_tokens) == h_wn_nparray.shape[0]

    fields['p_wn_feature'] = ArrayField(p_wn_nparray)
    fields['h_wn_feature'] = ArrayField(h_wn_nparray)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')
    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)