Python allennlp.data.fields.MetadataField() Examples

The following are 30 code examples of allennlp.data.fields.MetadataField(), drawn from open-source projects. The source file, project, and license for each example are noted above it. You may also want to check out all other available functions and classes of the module allennlp.data.fields.
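Before the examples, here is a minimal sketch of what a MetadataField does (a sketch, assuming the AllenNLP 1.x-era API that the examples below use): it wraps an arbitrary Python object that should travel with an Instance but never be converted to a tensor, so as_tensor() simply hands back the wrapped object.

from allennlp.data import Instance
from allennlp.data.fields import MetadataField

# MetadataField carries arbitrary, non-tensorizable data alongside an instance.
field = MetadataField({"id": "q1", "words": ["What", "is", "AllenNLP", "?"]})
instance = Instance({"metadata": field})

# Padding is a no-op, and as_tensor() returns the wrapped object unchanged.
assert field.get_padding_lengths() == {}
assert field.as_tensor(field.get_padding_lengths()) == {"id": "q1", "words": ["What", "is", "AllenNLP", "?"]}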
Example #1
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                         intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                                "dialog_act": dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "dialog_act": {}})
        return Instance(fields) 
Example #2
Source File: arc_multichoice_json_reader.py    From ARC-Solvers with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #3
Source File: ir_single_sequence_loader.py    From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, seq_id: str, seq_text: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        seq_id_field = MetadataField(seq_id)

        seq_tokenized = self._tokenizer.tokenize(seq_text)

        if self.max_seq_length > -1:
            seq_tokenized = seq_tokenized[:self.max_seq_length]
        if self.min_seq_length > -1 and len(seq_tokenized) < self.min_seq_length:
            seq_tokenized = seq_tokenized + [self.padding_value] * (self.min_seq_length - len(seq_tokenized))

        seq_field = TextField(seq_tokenized, self._token_indexers)
        
        return Instance({
            "seq_id": seq_id_field,
            "seq_tokens": seq_field})
Example #4
Source File: bert_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = MetadataField(query_id)
        doc_id_field = MetadataField(doc_id)

        query_tokenized = self._tokenizer.tokenize(query_sequence)

        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]
        if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
            query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]
        if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
            doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

        doc_field = TextField(query_tokenized + [self.sep_value] + doc_tokenized, self._token_indexers)

        return Instance({
            "query_id": query_id_field,
            "doc_id": doc_id_field,
            "doc_tokens": doc_field})
Example #5
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")

        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #6
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         cleaned_cite_text: str = None,
                         section_name: str = None,
                         is_citation: bool = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)
        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if is_citation is not None:
            fields['is_citation'] = LabelField(str(is_citation), label_namespace="cite_worthiness_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #7
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         venue: str = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #8
Source File: entailment_pair.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [Token(token.text)
                          for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
        hypothesis_tokens = [Token(token.text)
                             for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['label'] = LabelField(label)

        # metadata = {"premise_tokens": [x.text for x in premise_tokens],
        #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        # fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #9
Source File: snli.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         premise,
                         hypothesis,
                         label=None):
        # pylint: disable=arguments-differ
        fields = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
        fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        if label:
            fields[u'label'] = LabelField(label)

        metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                    u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        fields[u"metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #10
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                         intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
        fields["tokens"] = TextField(tokens, self._token_indexers)
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, fields["tokens"])
        if intents is not None:
            fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                                "dialog_act": dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "dialog_act": {}})
        return Instance(fields) 
Example #11
Source File: interleaving_dataset_reader.py    From allennlp with Apache License 2.0
def _read_all_at_once(self, datasets: Mapping[str, Iterable[Instance]]) -> Iterable[Instance]:
        for key, dataset in datasets.items():
            for instance in dataset:
                instance.fields[self._dataset_field_name] = MetadataField(key)
                yield instance 
Example #12
Source File: depend_parse.py    From glyce with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         words: List[str],
                         upos_tags: List[str],
                         dependencies: List[Tuple[str, int]] = None) -> Instance:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields: Dict[str, Field] = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields["words"] = tokens
        fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace="head_tags")
            fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace="head_index_tags")

        fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
        return Instance(fields) 
Example #13
Source File: open_nre_nyt_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         sentence: str,
                         head: str,
                         tail: str,
                         head_type: str = None,
                         tail_type: str = None,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        instance_id = f'{head}#{tail}'
        if label:
            instance_id = f'{instance_id}#{label}'

        fields['metadata'] = MetadataField({'instance_id': instance_id.lower()})

        tokens = self._token_splitter.split_words(sentence)
        head = self._token_splitter.split_words(head)
        tail = self._token_splitter.split_words(tail)

        # TODO: this should not be done here

        if self._masking_mode == 'ner_least_specific':
            logger.info("Using masking mode 'ner_least_specific'.")
            tokens = ([Token('__start__')]
                      + head + [Token('__del1__')] + head_type + [Token('__ent1__')]
                      + tail + [Token('__del2__')] + tail_type + [Token('__ent2__')]
                      + tokens + [Token('__clf__')])
        else:
            tokens = [Token('__start__')] + head + [Token('__del1__')] + tail + [Token('__del2__')] + tokens + [Token('__clf__')]

        fields['sentence'] = TextField(tokens, self._token_indexers)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #14
Source File: ebmnlp.py    From scibert with Apache License 2.0
def text_to_instance(self,
                         tokens: List[Token],
                         pico_tags: List[str] = None):
        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {'tokens': sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        
        # Set the field 'labels' according to the specified PIO element
        if pico_tags is not None:
            instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

        return Instance(instance_fields) 
Example #15
Source File: ir_labeled_tuple_loader.py    From transformer-kernel-ranking with Apache License 2.0
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = MetadataField(query_id)
        doc_id_field = MetadataField(doc_id)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]
        if self.min_query_length > -1 and len(query_tokenized) < self.min_query_length:
            query_tokenized = query_tokenized + [self.padding_value] * (self.min_query_length - len(query_tokenized))

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]
        if self.min_doc_length > -1 and len(doc_tokenized) < self.min_doc_length:
            doc_tokenized = doc_tokenized + [self.padding_value] * (self.min_doc_length - len(doc_tokenized))

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id": query_id_field,
            "doc_id": doc_id_field,
            "query_tokens": query_field,
            "doc_tokens": doc_field})
Example #16
Source File: classification_dataset_reader.py    From scibert with Apache License 2.0
def text_to_instance(self,
                         text: str,
                         label: str = None,
                         metadata: Any = None) -> Instance:  # type: ignore
        text_tokens = self._tokenizer.tokenize(text)
        fields = {
            'text': TextField(text_tokens, self._token_indexers),
        }
        if label is not None:
            fields['label'] = LabelField(label)

        if metadata:
            fields['metadata'] = MetadataField(metadata)
        return Instance(fields) 
Example #17
Source File: arc_multichoice_json_reader.py    From OpenBookQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int
                         ) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #18
Source File: datareader.py    From NLP_Toolkit with Apache License 2.0
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                         words: List[str] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": words})
        if tags is not None:
            labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
            if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
                return None
            rnd = random()
            # skip TN
            if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
                if rnd > self._tn_prob:
                    return None
            # skip TP
            else:
                if rnd > self._tp_prob:
                    return None

            fields["labels"] = SequenceLabelField(labels, sequence,
                                                  label_namespace="labels")
            fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
                                                  label_namespace="d_tags")
        return Instance(fields) 
Example #19
Source File: bert_reader_sent_selection.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = MetadataField(text1_span)
        fields['bert_s2_span'] = MetadataField(text2_span)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #20
Source File: paired_span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = example['paired_c_tokens']
        assert len(joint_tokens_seq) <= 512

        segments_ids = example['segment_ids']

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        # text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        # text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        # fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
Example #21
Source File: span_pred_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         example) -> Instance:

        fields: Dict[str, Field] = {}

        joint_tokens_seq = ['[CLS]'] + example['query_c_tokens'] + ['[SEP]'] + example['context_c_tokens'] + ['[SEP]']
        assert len(joint_tokens_seq) < 512

        text1_len = len(example['query_c_tokens']) + 2
        text2_len = len(example['context_c_tokens']) + 1

        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        # This text span is begin inclusive and end exclusive.
        text1_span = (1, 1 + len(example['query_c_tokens'])) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(example['context_c_tokens']))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])
        # fields['bert_s2_span'] = SpanField(text2_span)
        # fields['bert_s1_span'] = MetadataField(text1_span)
        # fields['bert_s2_span'] = MetadataField(text2_span)

        # However, the ground truth span is begin and end both inclusive
        fields['gt_span'] = SpanField(example['start_position'], example['end_position'], fields['paired_sequence'])

        fields['fid'] = IdField(example['fid'])
        fields['uid'] = IdField(example['uid'])

        return Instance(fields) 
Example #22
Source File: sequence_tagging.py    From magnitude with MIT License
def text_to_instance(self, tokens, tags=None):  # type: ignore
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields = {}
        sequence = TextField(tokens, self._token_indexers)
        fields[u"tokens"] = sequence
        fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        if tags is not None:
            fields[u"tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields) 
Example #23
Source File: interleaving_dataset_reader.py    From allennlp with Apache License 2.0
def _read_round_robin(self, datasets: Mapping[str, Iterable[Instance]]) -> Iterable[Instance]:
        remaining = set(datasets)
        dataset_iterators = {key: iter(dataset) for key, dataset in datasets.items()}

        while remaining:
            for key, dataset in dataset_iterators.items():
                if key in remaining:
                    try:
                        instance = next(dataset)
                        instance.fields[self._dataset_field_name] = MetadataField(key)
                        yield instance
                    except StopIteration:
                        remaining.remove(key) 
Example #24
Source File: metadata_field_test.py    From allennlp with Apache License 2.0
def test_mapping_works_with_dict(self):
        field = MetadataField({"a": 1, "b": [0]})

        assert "a" in field
        assert field["a"] == 1
        assert len(field) == 2

        keys = {k for k in field}
        assert keys == {"a", "b"}

        values = [v for v in field.values()]
        assert len(values) == 2
        assert 1 in values
        assert [0] in values 
Example #25
Source File: metadata_field_test.py    From allennlp with Apache License 2.0
def test_mapping_raises_with_non_dict(self):
        field = MetadataField(0)

        with pytest.raises(TypeError):
            _ = field[0]

        with pytest.raises(TypeError):
            _ = len(field)

        with pytest.raises(TypeError):
            _ = [x for x in field] 
Example #26
Source File: ontonotes_ner.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         tokens,
                         ner_tags=None):
        u"""
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        sequence = TextField(tokens, self._token_indexers)
        instance_fields = {u'tokens': sequence}
        instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == u"BIOUL":
                ner_tags = to_bioul(ner_tags, encoding=u"BIO")
            instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields) 
Example #27
Source File: data_iterator.py    From magnitude with MIT License
def add_epoch_number(batch, epoch):
    u"""
    Add the epoch number to the batch instances as a MetadataField.
    """
    for instance in batch.instances:
        instance.fields[u'epoch_num'] = MetadataField(epoch)
    return batch 
Example #28
Source File: sequence_tagging.py    From allennlp with Apache License 2.0
def text_to_instance(  # type: ignore
        self, tokens: List[Token], tags: List[str] = None
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """

        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        if tags is not None:
            fields["tags"] = SequenceLabelField(tags, sequence)
        return Instance(fields) 
Example #29
Source File: universal_dependencies.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         words,
                         upos_tags,
                         dependencies=None):
        # pylint: disable=arguments-differ
        u"""
        Parameters
        ----------
        words : ``List[str]``, required.
            The words in the sentence to be encoded.
        upos_tags : ``List[str]``, required.
            The universal dependencies POS tags for each word.
        dependencies : ``List[Tuple[str, int]]``, optional (default = None)
            A list of (head tag, head index) tuples. Indices are 1 indexed,
            meaning an index of 0 corresponds to that word being the root of
            the dependency tree.

        Returns
        -------
        An instance containing words, upos tags, dependency head tags and head
        indices as fields.
        """
        fields = {}

        tokens = TextField([Token(w) for w in words], self._token_indexers)
        fields[u"words"] = tokens
        fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_tags")
            fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace=u"head_index_tags")

        fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
        return Instance(fields) 
Example #30
Source File: arc_multichoice_with_facts_text_json_reader_multi_source.py    From OpenBookQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         facts_text_list: List[str],
                         question2facts_mapping: List[float],
                         choice2facts_mapping: List[List[float]],
                         answer_id: int,
                         meta_fields: Dict = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        question_tokens = self.tokenize(question_text, "question")
        choices_tokens_list = [self.tokenize(x, "choice") for x in choice_text_list]
        facts_tokens_list = [self.tokenize(x, "fact") for x in facts_text_list]

        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['facts_list'] = ListField([TextField(x, self._token_indexers) for x in facts_tokens_list])
        fields['question2facts_map'] = ArrayField(np.asarray(question2facts_mapping))
        fields['choice2facts_map'] = ArrayField(np.asarray(choice2facts_mapping))

        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "facts_text_list": facts_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
            "facts_tokens_list": [[x.text for x in ct] for ct in facts_tokens_list],
            "label_gold": answer_id,
        }

        if meta_fields is not None:
            for k, v in meta_fields.items():
                metadata[k] = v

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)
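A closing note on how these metadata fields are consumed: when instances are batched, MetadataField values are not stacked into tensors; the batch exposes a plain Python list with one metadata object per instance, which is why forward() methods in these projects typically accept the metadata key as List[Dict[str, Any]]. A small sketch, assuming the AllenNLP 1.x Batch API:

from allennlp.data import Batch, Instance, Vocabulary
from allennlp.data.fields import MetadataField

instances = [
    Instance({"metadata": MetadataField({"words": ["a", "b"]})}),
    Instance({"metadata": MetadataField({"words": ["c"]})}),
]
batch = Batch(instances)
batch.index_instances(Vocabulary())

# Batching yields a list with one metadata entry per instance, not a tensor.
tensors = batch.as_tensor_dict()
assert tensors["metadata"] == [{"words": ["a", "b"]}, {"words": ["c"]}]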