Python allennlp.data.fields.SequenceLabelField() Examples
The following are 28 code examples of allennlp.data.fields.SequenceLabelField(), drawn from open-source projects. The originating project and source file are listed above each example. You may also want to check out all available functions and classes of the module allennlp.data.fields.
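Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: a SequenceLabelField attaches one label per element to the SequenceField (usually a TextField) it is built from, so labels index, pad, and batch in lock-step with the tokens. The sentence, tag strings, and field names below are invented for illustration; the imports and the default "labels" namespace follow the AllenNLP API.

from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# One label per token; the field keeps a reference to the TextField it labels.
tokens = TextField([Token(t) for t in ["John", "lives", "in", "Berlin"]],
                   {"tokens": SingleIdTokenIndexer()})
tags = SequenceLabelField(["U-PER", "O", "O", "U-LOC"], tokens)  # default namespace: "labels"
instance = Instance({"tokens": tokens, "tags": tags})

# String labels are added to the vocabulary and converted to integer ids
# when the instance is indexed.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)

Passing a label list whose length differs from the sequence length raises a ConfigurationError; that one-label-per-element invariant is what the examples below rely on.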
Example #1
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #2
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0 | 6 votes |
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)
    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count],
                                                      sentences_tokens_indexed)
    return Instance(result)
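Note that the sequence being labelled does not have to be a TextField: here each tag applies to one sentence in a ListField, and the SequenceLabelField simply pairs one label with each element of whatever SequenceField it is given.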
Example #3
Source File: template_text2sql.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def text_to_instance(
    self,  # type: ignore
    query: List[str],
    slot_tags: List[str] = None,
    sql_template: str = None,
) -> Instance:
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    if slot_tags is not None and sql_template is not None:
        slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
        template = LabelField(sql_template, label_namespace="template_labels")
        fields["slot_tags"] = slot_field
        fields["template"] = template

    return Instance(fields)
Example #4
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #5
Source File: dataset_reader.py From nanigonet with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
    if len(tokens) > self._max_token_len:
        tokens = tokens[:self._max_token_len]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
        if tags:
            tags = tags[:self._max_token_len]

    fields = {}

    text_field = TextField(tokens, self._token_indexers)
    fields['tokens'] = text_field
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    return Instance(fields)
Example #6
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #7
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #8
Source File: vcr.py From r2c with MIT License | 5 votes |
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers,
                      pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
Example #9
Source File: depend_parse.py From glyce with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)
Example #10
Source File: rebalance_dataset_ensemble.py From swagaf with MIT License | 5 votes |
def collate(self, items_l):
    # Assume all of these have the same length
    index_l, second_sentences_l, pos_tags_l, feats_l, context_len_l = zip(*items_l)
    feats = Variable(torch.FloatTensor(np.stack(feats_l)))
    inds = np.array(index_l)

    instances = []
    for second_sentences, pos_tags, context_len in zip(second_sentences_l, pos_tags_l, context_len_l):
        for second_sent, pos_tag in zip(second_sentences, pos_tags):
            instance_d = {
                'words': TextField([Token(token) for token in ['@@bos@@'] + second_sent + ['@@eos@@']],
                                   {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True)}),
                'postags': TextField([Token(token) for token in ['@@bos@@'] + pos_tag + ['@@eos@@']],
                                     {'pos': SingleIdTokenIndexer(namespace='pos', lowercase_tokens=False)}),
            }
            instance_d['context_indicator'] = SequenceLabelField([1] * (context_len + 1) +
                                                                 [0] * (len(second_sent) - context_len + 1),
                                                                 instance_d['words'])
            instances.append(Instance(instance_d))
    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(for_training=self.train)
    # instances_mask = torch.LongTensor(np.stack([np.array([len(sub_g) > 0 for sub_g in g], dtype=np.int64)
    #                                             for g in selected_gens]))
    return {
        'lm_feats': feats,
        'inds': inds,
        'ending_word_ids': tensor_dict['words']['tokens'].view(inds.shape[0], -1, tensor_dict['words']['tokens'].size(1)),
        'postags_word_ids': tensor_dict['postags']['pos'].view(inds.shape[0], -1, tensor_dict['postags']['pos'].size(1)),
        'ctx_indicator': tensor_dict['context_indicator'].view(inds.shape[0], -1, tensor_dict['context_indicator'].size(1)),
    }
Example #11
Source File: conll_reader.py From allennlp_tutorial with MIT License | 5 votes |
def text_to_instance(self, words: List[str], ner_tags: List[str]) -> Instance:
    fields: Dict[str, Field] = {}

    # wrap each token in the file with a token object
    tokens = TextField([Token(w) for w in words], self._token_indexers)

    # Instances in AllenNLP are created using Python dictionaries,
    # which map the token key to the Field type
    fields["tokens"] = tokens
    fields["label"] = SequenceLabelField(ner_tags, tokens)

    return Instance(fields)
Example #12
Source File: vcr.py From HGL-pytorch with MIT License | 5 votes |
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers,
                      pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
Example #13
Source File: ebmnlp.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], pico_tags: List[str] = None):
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Set the field 'labels' according to the specified PIO element
    if pico_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

    return Instance(instance_fields)
Example #14
Source File: relation_instances_reader.py From comb_dist_direct_relex with Apache License 2.0 | 5 votes |
def _tokens_distances_fields(self, tokens):
    """Returns the updated list of tokens and entity distances for the first and second entity as fields."""
    tokens, positions1, positions2 = self._tokens_distances(tokens)
    t_f = TextField(tokens, self._token_indexers)
    p1_f = SequenceLabelField(positions1, t_f)
    p2_f = SequenceLabelField(positions2, t_f)
    return t_f, p1_f, p2_f
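As in Examples #4 and #10, the "labels" here are lists of integers (entity distances) rather than strings. SequenceLabelField accepts pre-indexed integer labels and uses them as-is; only string labels are converted through the vocabulary when the instance is indexed.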
Example #15
Source File: sequence_tagging.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(  # type: ignore
    self, tokens: List[Token], tags: List[str] = None
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #16
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace(u"this", u"words")
    self.vocab.add_token_to_namespace(u"is", u"words")
    self.vocab.add_token_to_namespace(u"a", u"words")
    self.vocab.add_token_to_namespace(u"sentence", u'words')
    self.vocab.add_token_to_namespace(u"s", u'characters')
    self.vocab.add_token_to_namespace(u"e", u'characters')
    self.vocab.add_token_to_namespace(u"n", u'characters')
    self.vocab.add_token_to_namespace(u"t", u'characters')
    self.vocab.add_token_to_namespace(u"c", u'characters')
    for label in [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k']:
        self.vocab.add_token_to_namespace(label, u'labels')

    self.word_indexer = {u"words": SingleIdTokenIndexer(u"words")}
    self.words_and_characters_indexers = {u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")}
    self.field1 = TextField([Token(t) for t in [u"this", u"is", u"a", u"sentence"]],
                            self.word_indexer)
    self.field2 = TextField([Token(t) for t in [u"this", u"is", u"a", u"different", u"sentence"]],
                            self.word_indexer)
    self.field3 = TextField([Token(t) for t in [u"this", u"is", u"another", u"sentence"]],
                            self.word_indexer)
    self.empty_text_field = self.field1.empty_field()

    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super(TestListField, self).setUp()
Example #17
Source File: universal_dependencies.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     words,
                     upos_tags,
                     dependencies=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields[u"words"] = tokens
    fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                  tokens,
                                                  label_namespace=u"head_tags")
        fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_index_tags")

    fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
    return Instance(fields)
Example #18
Source File: datareader.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                     words: List[str] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": words})
    if tags is not None:
        labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
        if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
            return None
        rnd = random()
        # skip TN
        if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
            if rnd > self._tn_prob:
                return None
        # skip TP
        else:
            if rnd > self._tp_prob:
                return None

        fields["labels"] = SequenceLabelField(labels, sequence, label_namespace="labels")
        fields["d_tags"] = SequenceLabelField(detect_tags, sequence, label_namespace="d_tags")
    return Instance(fields)
Example #19
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #20
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #21
Source File: input_reduction.py From allennlp with Apache License 2.0 | 5 votes |
def _get_ner_tags_and_mask(
    instance: Instance, input_field_to_attack: str, ignore_tokens: List[str]
):
    """
    Used for the NER task. Sets the num_ignore tokens, saves the original predicted tag
    and a 0/1 mask in the position of the tags
    """
    # Set num_ignore_tokens
    num_ignore_tokens = 0
    input_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token in input_field.tokens:
        if str(token) in ignore_tokens:
            num_ignore_tokens += 1

    # save the original tags and a 0/1 mask where the tags are
    tag_mask = []
    original_tags = []
    tag_field: SequenceLabelField = instance["tags"]  # type: ignore
    for label in tag_field.labels:
        if label != "O":
            tag_mask.append(1)
            original_tags.append(label)
            num_ignore_tokens += 1
        else:
            tag_mask.append(0)

    return num_ignore_tokens, tag_mask, original_tags
Example #22
Source File: conll2003.py From magnitude with MIT License | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     pos_tags=None,
                     chunk_tags=None,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == u"BIOUL":
        coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if u'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(u"Dataset reader was specified to use pos_tags as "
                                     u"features. Pass them to text_to_instance.")
        instance_fields[u'pos_tags'] = SequenceLabelField(pos_tags, sequence, u"pos_tags")
    if u'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(u"Dataset reader was specified to use chunk tags as "
                                     u"features. Pass them to text_to_instance.")
        instance_fields[u'chunk_tags'] = SequenceLabelField(coded_chunks, sequence, u"chunk_tags")
    if u'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(u"Dataset reader was specified to use NER tags as "
                                     u" features. Pass them to text_to_instance.")
        instance_fields[u'ner_tags'] = SequenceLabelField(coded_ner, sequence, u"ner_tags")

    # Add "tag label" to instance
    if self.tag_label == u'ner' and coded_ner is not None:
        instance_fields[u'tags'] = SequenceLabelField(coded_ner, sequence)
    elif self.tag_label == u'pos' and pos_tags is not None:
        instance_fields[u'tags'] = SequenceLabelField(pos_tags, sequence)
    elif self.tag_label == u'chunk' and coded_chunks is not None:
        instance_fields[u'tags'] = SequenceLabelField(coded_chunks, sequence)

    return Instance(instance_fields)
Example #23
Source File: universal_dependencies.py From udify with MIT License | 4 votes |
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     lemmas: List[str] = None,
                     lemma_rules: List[str] = None,
                     upos_tags: List[str] = None,
                     xpos_tags: List[str] = None,
                     feats: List[str] = None,
                     dependencies: List[Tuple[str, int]] = None,
                     ids: List[str] = None,
                     multiword_ids: List[str] = None,
                     multiword_forms: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["tokens"] = tokens

    names = ["upos", "xpos", "feats", "lemmas"]
    all_tags = [upos_tags, xpos_tags, feats, lemma_rules]
    for name, field in zip(names, all_tags):
        if field:
            fields[name] = SequenceLabelField(field, tokens, label_namespace=name)

    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({
        "words": words,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "lemma_rules": lemma_rules,
        "ids": ids,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms
    })
    return Instance(fields)
Example #24
Source File: list_field_test.py From allennlp with Apache License 2.0 | 4 votes |
def setup_method(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
        self.vocab.add_token_to_namespace(label, "labels")

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
    )
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
    )
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
    )
    self.empty_text_field = self.field1.empty_field()

    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    empty_list_field = ListField([text_field.empty_field()])
    empty_fields = {"list_tensor": empty_list_field}
    self.empty_instance = Instance(empty_fields)

    non_empty_list_field = ListField([text_field])
    non_empty_fields = {"list_tensor": non_empty_list_field}
    self.non_empty_instance = Instance(non_empty_fields)

    super().setup_method()
Example #25
Source File: transition_eds_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     root_id: List[int] = None,
                     meta_info: List[str] = None,
                     concept_label: List[int] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     gold_mrps: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]

    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range

    if concept_label is not None:
        meta_dict["concept_label"] = concept_label
        fields["concept_label"] = TextField([Token(a) for a in concept_label], self._concept_label_indexers)

    if root_id is not None:
        meta_dict["root_id"] = root_id[0]

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #26
Source File: transition_sdp_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     mrp_pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     meta_info: List[str] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     frame: List[str] = None,
                     pos_tag: List[str] = None,
                     node_label: List[str] = None,
                     gold_mrps: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)

    if mrp_pos_tags is not None:
        fields["mrp_pos_tags"] = SequenceLabelField(mrp_pos_tags, token_field, label_namespace="pos")
    if frame is not None:
        fields["frame"] = SequenceLabelField(frame, token_field, label_namespace="frame")
    if pos_tag is not None:
        fields["pos_tag"] = SequenceLabelField(pos_tag, token_field, label_namespace="pos_tag")
    if node_label is not None:
        fields["node_label"] = SequenceLabelField(node_label, token_field, label_namespace="node_label")

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]

    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range

    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #27
Source File: transition_amr_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     gold_actions: List[List[str]] = None,
                     id: str = None,
                     amr: str = None,
                     input: str = None,
                     mrp: str = None,
                     companion: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if id:
        meta_dict["id"] = id
    if amr:
        meta_dict["amr"] = amr
    if input:
        meta_dict["input"] = input
    if mrp:
        meta_dict["mrp"] = json.loads(mrp)
    if companion:
        meta_dict["companion"] = json.loads(companion)

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if gold_actions is not None:
        meta_dict["gold_actions"] = ['@@:@@'.join(a) for a in gold_actions]
        fields["gold_actions"] = TextField([Token('@@:@@'.join(a)) for a in gold_actions],
                                           {'actions': SingleIdTokenIndexer(namespace='actions')})
        fields["gold_newnodes"] = TextField(
            [Token(a[1] if a[0] == 'NEWNODE' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'newnodes': SingleIdTokenIndexer(namespace='newnodes')})
        fields["gold_entities"] = TextField(
            [Token(a[1] if a[0] == 'ENTITY' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'entities': SingleIdTokenIndexer(namespace='entities')})
        fields["gold_relations"] = TextField(
            [Token(a[1] if a[0] in ['LEFT', 'RIGHT'] else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'relations': SingleIdTokenIndexer(namespace='relations')})

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #28
Source File: input_reduction.py From allennlp with Apache License 2.0 | 4 votes |
def _remove_one_token(
    instance: Instance,
    input_field_to_attack: str,
    grads: np.ndarray,
    ignore_tokens: List[str],
    beam_size: int,
    tag_mask: List[int],
) -> List[Tuple[Instance, int, List[int]]]:
    """
    Finds the token with the smallest gradient and removes it.
    """
    # Compute L2 norm of all grads.
    grads_mag = [np.sqrt(grad.dot(grad)) for grad in grads]

    # Skip all ignore_tokens by setting grad to infinity
    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token_idx, token in enumerate(text_field.tokens):
        if token in ignore_tokens:
            grads_mag[token_idx] = float("inf")

    # For NER, skip all tokens that are not in outside
    if "tags" in instance:
        tag_field: SequenceLabelField = instance["tags"]  # type: ignore
        labels: List[str] = tag_field.labels  # type: ignore
        for idx, label in enumerate(labels):
            if label != "O":
                grads_mag[idx] = float("inf")

    reduced_instances_and_smallest: List[Tuple[Instance, int, List[int]]] = []
    for _ in range(beam_size):
        # copy instance and edit later
        copied_instance = deepcopy(instance)
        copied_text_field: TextField = copied_instance[input_field_to_attack]  # type: ignore

        # find smallest
        smallest = np.argmin(grads_mag)
        if grads_mag[smallest] == float("inf"):  # if all are ignored tokens, return.
            break
        grads_mag[smallest] = float("inf")  # so the other beams don't use this token

        # remove smallest
        inputs_before_smallest = copied_text_field.tokens[0:smallest]
        inputs_after_smallest = copied_text_field.tokens[smallest + 1 :]
        copied_text_field.tokens = inputs_before_smallest + inputs_after_smallest

        if "tags" in instance:
            tag_field: SequenceLabelField = copied_instance["tags"]  # type: ignore
            tag_field_before_smallest = tag_field.labels[0:smallest]
            tag_field_after_smallest = tag_field.labels[smallest + 1 :]
            tag_field.labels = tag_field_before_smallest + tag_field_after_smallest  # type: ignore
            tag_field.sequence_field = copied_text_field

        copied_instance.indexed = False
        reduced_instances_and_smallest.append((copied_instance, smallest, tag_mask))

    return reduced_instances_and_smallest