Python allennlp.data.fields.MetadataField() Examples
The following are 30 code examples of allennlp.data.fields.MetadataField(). You can go to the original project or source file by following the links above each example, or check out all available functions/classes of the module allennlp.data.fields.
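Before the project examples, here is a minimal, self-contained sketch of what a MetadataField is for: it attaches arbitrary, non-tensorizable data (ids, raw strings, spans) to an Instance, and when it wraps a dict it also behaves like a read-only mapping. The field names here are illustrative, not taken from any project below.

# A minimal usage sketch, assuming a standard AllenNLP install.
from allennlp.data import Instance
from allennlp.data.fields import MetadataField

metadata = MetadataField({"id": "q-42", "source": "dev"})
instance = Instance({"metadata": metadata})

# When the wrapped value is a dict, the field supports the Mapping
# protocol, as the metadata_field_test.py examples below verify.
assert "id" in metadata
assert metadata["id"] == "q-42"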
Example #1
Source File: dataset_reader.py From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': {}})
    return Instance(fields)
Example #2
Source File: arc_multichoice_json_reader.py From ARC-Solvers with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question_text)
    choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
    }
    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)
Example #3
Source File: ir_single_sequence_loader.py From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, seq_id: str, seq_text: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    seq_id_field = MetadataField(seq_id)

    seq_tokenized = self._tokenizer.tokenize(seq_text)
    if self.max_seq_length > -1:
        seq_tokenized = seq_tokenized[:self.max_seq_length]
    if self.min_seq_length > -1 and len(seq_tokenized) < self.min_seq_length:
        seq_tokenized = seq_tokenized + [self.padding_value] * (self.min_seq_length - len(seq_tokenized))

    seq_field = TextField(seq_tokenized, self._token_indexers)

    return Instance({
        "seq_id": seq_id_field,
        "seq_tokens": seq_field})
Example #4
Source File: bert_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

    doc_field = TextField(query_tokenized + [self.sep_value] + doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "doc_tokens": doc_field})
Example #5
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     section_name: str = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if section_name is not None:
        fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #6
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     cleaned_cite_text: str = None,
                     section_name: str = None,
                     is_citation: bool = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if is_citation is not None:
        fields['is_citation'] = LabelField(str(is_citation), label_namespace="cite_worthiness_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #7
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     venue: str = None,
                     section_name: str = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if section_name is not None:
        fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #8
Source File: entailment_pair.py From multee with Apache License 2.0
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = [Token(token.text) for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
    hypothesis_tokens = [Token(token.text) for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['label'] = LabelField(label)
    # metadata = {"premise_tokens": [x.text for x in premise_tokens],
    #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    # fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #9
Source File: snli.py From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)
    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #10
Source File: dataset_reader.py From ConvLab with MIT License
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': {}})
    return Instance(fields)
Example #11
Source File: interleaving_dataset_reader.py From allennlp with Apache License 2.0
def _read_all_at_once(self, datasets: Mapping[str, Iterable[Instance]]) -> Iterable[Instance]:
    for key, dataset in datasets.items():
        for instance in dataset:
            instance.fields[self._dataset_field_name] = MetadataField(key)
            yield instance
Example #12
Source File: depend_parse.py From glyce with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)
Example #13
Source File: open_nre_nyt_reader.py From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     sentence: str,
                     head: str,
                     tail: str,
                     head_type: str = None,
                     tail_type: str = None,
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    instance_id = f'{head}#{tail}'
    if label:
        instance_id = f'{instance_id}#{label}'
    fields['metadata'] = MetadataField({'instance_id': instance_id.lower()})

    tokens = self._token_splitter.split_words(sentence)
    head = self._token_splitter.split_words(head)
    tail = self._token_splitter.split_words(tail)

    # TODO: this should not be done here
    if self._masking_mode == 'ner_least_specific':
        logger.info(f"Using masking mode 'ner_least_specific'.")
        tokens = ([Token('__start__')]
                  + head + [Token('__del1__')] + head_type + [Token('__ent1__')]
                  + tail + [Token('__del2__')] + tail_type + [Token('__ent2__')]
                  + tokens + [Token('__clf__')])
    else:
        tokens = ([Token('__start__')]
                  + head + [Token('__del1__')]
                  + tail + [Token('__del2__')]
                  + tokens + [Token('__clf__')])

    fields['sentence'] = TextField(tokens, self._token_indexers)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #14
Source File: ebmnlp.py From scibert with Apache License 2.0
def text_to_instance(self, tokens: List[Token], pico_tags: List[str] = None):
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Set the field 'labels' according to the specified PIO element
    if pico_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

    return Instance(instance_fields)
Example #15
Source File: ir_labeled_tuple_loader.py From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = MetadataField(query_id)
    doc_id_field = MetadataField(doc_id)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
        query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
        doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #16
Source File: classification_dataset_reader.py From scibert with Apache License 2.0
def text_to_instance(self, text: str, label: str = None, metadata: Any = None) -> Instance:  # type: ignore
    text_tokens = self._tokenizer.tokenize(text)
    fields = {
        'text': TextField(text_tokens, self._token_indexers),
    }
    if label is not None:
        fields['label'] = LabelField(label)
    if metadata:
        fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #17
Source File: arc_multichoice_json_reader.py From OpenBookQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question_text)
    choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
    }
    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)
Example #18
Source File: datareader.py From NLP_Toolkit with Apache License 2.0
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                     words: List[str] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": words})
    if tags is not None:
        labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
        if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
            return None
        rnd = random()
        # skip TN
        if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
            if rnd > self._tn_prob:
                return None
        # skip TP
        else:
            if rnd > self._tp_prob:
                return None
        fields["labels"] = SequenceLabelField(labels, sequence, label_namespace="labels")
        fields["d_tags"] = SequenceLabelField(detect_tags, sequence, label_namespace="d_tags")
    return Instance(fields)
Example #19
Source File: bert_reader_sent_selection.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     sent1: str,  # Important type information
                     sent2: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
    tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.query_l]
    tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

    joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
    text1_len = len(tokenized_text1) + 2
    text2_len = len(tokenized_text2) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

    fields['bert_s1_span'] = MetadataField(text1_span)
    fields['bert_s2_span'] = MetadataField(text2_span)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
Example #20
Source File: paired_span_pred_reader.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = example['paired_c_tokens']
    assert len(joint_tokens_seq) <= 512

    segments_ids = example['segment_ids']

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    # text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))
    # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
Example #21
Source File: span_pred_reader.py From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                     example) -> Instance:
    fields: Dict[str, Field] = {}

    joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
    assert len(joint_tokens_seq) < 512

    text1_len = len(example['query_c_tokens']) + 2
    text2_len = len(example['context_c_tokens']) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    # This text span is begin inclusive and end exclusive.
    text1_span = (1, 1 + len(example['query_c_tokens']))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
    # fields['bert_s2_span'] = SpanField(text2_span)
    # fields['bert_s1_span'] = MetadataField(text1_span)
    # fields['bert_s2_span'] = MetadataField(text2_span)

    # However, the ground truth span is begin and end both inclusive
    fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

    fields['fid'] = IdField(example['fid'])
    fields['uid'] = IdField(example['uid'])

    return Instance(fields)
Example #22
Source File: sequence_tagging.py From magnitude with MIT License
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #23
Source File: interleaving_dataset_reader.py From allennlp with Apache License 2.0
def _read_round_robin(self, datasets: Mapping[str, Iterable[Instance]]) -> Iterable[Instance]:
    remaining = set(datasets)
    dataset_iterators = {key: iter(dataset) for key, dataset in datasets.items()}

    while remaining:
        for key, dataset in dataset_iterators.items():
            if key in remaining:
                try:
                    instance = next(dataset)
                    instance.fields[self._dataset_field_name] = MetadataField(key)
                    yield instance
                except StopIteration:
                    remaining.remove(key)
Example #24
Source File: metadata_field_test.py From allennlp with Apache License 2.0
def test_mapping_works_with_dict(self):
    field = MetadataField({"a": 1, "b": [0]})

    assert "a" in field
    assert field["a"] == 1
    assert len(field) == 2

    keys = {k for k in field}
    assert keys == {"a", "b"}

    values = [v for v in field.values()]
    assert len(values) == 2
    assert 1 in values
    assert [0] in values
Example #25
Source File: metadata_field_test.py From allennlp with Apache License 2.0
def test_mapping_raises_with_non_dict(self):
    field = MetadataField(0)

    with pytest.raises(TypeError):
        _ = field[0]

    with pytest.raises(TypeError):
        _ = len(field)

    with pytest.raises(TypeError):
        _ = [x for x in field]
Example #26
Source File: ontonotes_ner.py From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #27
Source File: data_iterator.py From magnitude with MIT License
def add_epoch_number(batch, epoch):
    u"""
    Add the epoch number to the batch instances as a MetadataField.
    """
    for instance in batch.instances:
        instance.fields[u'epoch_num'] = MetadataField(epoch)
    return batch
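Because a MetadataField is never padded or converted to a tensor, the wrapped value round-trips unchanged: after batching, the model receives it as a plain Python list with one entry per instance. A hedged sketch of the consuming side for the epoch number above (the function stands in for the start of a Model.forward method; the name is illustrative):

from typing import List, Optional

def read_epoch(epoch_num: Optional[List[int]] = None) -> Optional[int]:
    # Every instance in the batch carries the same epoch number,
    # so reading the first entry is enough.
    if epoch_num is not None:
        return epoch_num[0]
    return None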
Example #28
Source File: sequence_tagging.py From allennlp with Apache License 2.0
def text_to_instance(  # type: ignore
    self, tokens: List[Token], tags: List[str] = None
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
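Many of the readers above (Examples #1, #10, #14, #22, #26, #28) stash the raw words under a "metadata" key. A hedged sketch of how the matching model side typically consumes that field (a stand-in for a Model.forward method, with the model internals elided):

from typing import Any, Dict, List, Optional

def forward(tokens: Dict[str, Any],
            tags: Optional[Any] = None,
            metadata: Optional[List[Dict[str, Any]]] = None) -> Dict[str, Any]:
    # "metadata" arrives as a list of the wrapped dicts, one per instance.
    output: Dict[str, Any] = {}
    if metadata is not None:
        # Pass the raw words through so predictions stay human-readable.
        output["words"] = [m["words"] for m in metadata]
    return output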
Example #29
Source File: universal_dependencies.py From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                     words,
                     upos_tags,
                     dependencies=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields[u"words"] = tokens
    fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                  tokens,
                                                  label_namespace=u"head_tags")
        fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_index_tags")

    fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
    return Instance(fields)
Example #30
Source File: arc_multichoice_with_facts_text_json_reader_multi_source.py From OpenBookQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     facts_text_list: List[str],
                     question2facts_mapping: List[float],
                     choice2facts_mapping: List[List[float]],
                     answer_id: int,
                     meta_fields: Dict = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self.tokenize(question_text, "question")
    choices_tokens_list = [self.tokenize(x, "choice") for x in choice_text_list]
    facts_tokens_list = [self.tokenize(x, "fact") for x in facts_text_list]

    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
    fields['facts_list'] = ListField([TextField(x, self._token_indexers) for x in facts_tokens_list])
    fields['question2facts_map'] = ArrayField(np.asarray(question2facts_mapping))
    fields['choice2facts_map'] = ArrayField(np.asarray(choice2facts_mapping))
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "facts_text_list": facts_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        "facts_tokens_list": [[x.text for x in ct] for ct in facts_tokens_list],
        "label_gold": answer_id,
    }
    if meta_fields is not None:
        for k, v in meta_fields.items():
            metadata[k] = v

    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)