Python allennlp.data.fields.SequenceLabelField() Examples
The following are 28 code examples of allennlp.data.fields.SequenceLabelField(), drawn from open-source projects. The originating project and source file are listed above each example. You may also want to check out all available functions and classes of the module allennlp.data.fields.
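Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: a SequenceLabelField attaches one label per element to the SequenceField (usually a TextField) it is built from, so labels index, pad, and batch in lock-step with the tokens. The sentence, tag strings, and field names below are invented for illustration; the imports and the default "labels" namespace follow the AllenNLP API.

from allennlp.data.fields import SequenceLabelField, TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# One label per token; the field keeps a reference to the TextField it labels.
tokens = TextField([Token(t) for t in ["John", "lives", "in", "Berlin"]],
                   {"tokens": SingleIdTokenIndexer()})
tags = SequenceLabelField(["U-PER", "O", "O", "U-LOC"], tokens)  # default namespace: "labels"
instance = Instance({"tokens": tokens, "tags": tags})

# String labels are added to the vocabulary and converted to integer ids
# when the instance is indexed.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)

Passing a label list whose length differs from the sequence length raises a ConfigurationError; that one-label-per-element invariant is what the examples below rely on.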
Example #1
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #2
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0 | 6 votes |
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    if sentences is None:
        if self._language == "ru":
            sentences = [s.text for s in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)
    sentences_tokens = []
    for sentence in sentences[:self._max_sentences_count]:
        sentence = sentence.lower() if self._lowercase else sentence
        tokens = self._tokenizer.tokenize(sentence)[:self._sentence_max_tokens]
        tokens.insert(0, Token(START_SYMBOL))
        tokens.append(Token(END_SYMBOL))
        indexed_tokens = TextField(tokens, self._source_token_indexers)
        sentences_tokens.append(indexed_tokens)

    sentences_tokens_indexed = ListField(sentences_tokens)
    result = {'source_sentences': sentences_tokens_indexed}

    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:self._max_sentences_count],
                                                      sentences_tokens_indexed)
    return Instance(result)
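Note that the sequence being labelled does not have to be a TextField: here each tag applies to one sentence in a ListField, and the SequenceLabelField simply pairs one label with each element of whatever SequenceField it is given.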
Example #3
Source File: template_text2sql.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def text_to_instance(
    self,  # type: ignore
    query: List[str],
    slot_tags: List[str] = None,
    sql_template: str = None,
) -> Instance:
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    if slot_tags is not None and sql_template is not None:
        slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
        template = LabelField(sql_template, label_namespace="template_labels")
        fields["slot_tags"] = slot_field
        fields["template"] = template

    return Instance(fields)
Example #4
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #5
Source File: dataset_reader.py From nanigonet with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
    if len(tokens) > self._max_token_len:
        tokens = tokens[:self._max_token_len]
        print(f'Length of tokens exceeded the limit {self._max_token_len}. Truncating...')
        if tags:
            tags = tags[:self._max_token_len]

    fields = {}

    text_field = TextField(tokens, self._token_indexers)
    fields['tokens'] = text_field
    if tags:
        fields['tags'] = SequenceLabelField(tags, text_field)

    return Instance(fields)
Example #6
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #7
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #8
Source File: vcr.py From r2c with MIT License | 5 votes |
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers,
                      pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
Example #9
Source File: depend_parse.py From glyce with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     upos_tags: List[str],
                     dependencies: List[Tuple[str, int]] = None) -> Instance:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["words"] = tokens
    fields["pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace="pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({"words": words, "pos": upos_tags})
    return Instance(fields)
Example #10
Source File: rebalance_dataset_ensemble.py From swagaf with MIT License | 5 votes |
def collate(self, items_l):
    # Assume all of these have the same length
    index_l, second_sentences_l, pos_tags_l, feats_l, context_len_l = zip(*items_l)
    feats = Variable(torch.FloatTensor(np.stack(feats_l)))
    inds = np.array(index_l)

    instances = []
    for second_sentences, pos_tags, context_len in zip(second_sentences_l, pos_tags_l, context_len_l):
        for second_sent, pos_tag in zip(second_sentences, pos_tags):
            instance_d = {
                'words': TextField([Token(token) for token in ['@@bos@@'] + second_sent + ['@@eos@@']],
                                   {'tokens': SingleIdTokenIndexer(namespace='tokens', lowercase_tokens=True)}),
                'postags': TextField([Token(token) for token in ['@@bos@@'] + pos_tag + ['@@eos@@']],
                                     {'pos': SingleIdTokenIndexer(namespace='pos', lowercase_tokens=False)}),
            }
            instance_d['context_indicator'] = SequenceLabelField([1] * (context_len + 1) +
                                                                 [0] * (len(second_sent) - context_len + 1),
                                                                 instance_d['words'])
            instances.append(Instance(instance_d))
    batch = Batch(instances)
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(for_training=self.train)
    # instances_mask = torch.LongTensor(np.stack([np.array([len(sub_g) > 0 for sub_g in g], dtype=np.int64)
    #                                             for g in selected_gens]))
    return {
        'lm_feats': feats,
        'inds': inds,
        'ending_word_ids': tensor_dict['words']['tokens'].view(inds.shape[0], -1, tensor_dict['words']['tokens'].size(1)),
        'postags_word_ids': tensor_dict['postags']['pos'].view(inds.shape[0], -1, tensor_dict['postags']['pos'].size(1)),
        'ctx_indicator': tensor_dict['context_indicator'].view(inds.shape[0], -1, tensor_dict['context_indicator'].size(1)),
    }
Example #11
Source File: conll_reader.py From allennlp_tutorial with MIT License | 5 votes |
def text_to_instance(self, words: List[str], ner_tags: List[str]) -> Instance:
    fields: Dict[str, Field] = {}

    # wrap each token in the file with a token object
    tokens = TextField([Token(w) for w in words], self._token_indexers)

    # Instances in AllenNLP are created using Python dictionaries,
    # which map the token key to the Field type
    fields["tokens"] = tokens
    fields["label"] = SequenceLabelField(ner_tags, tokens)

    return Instance(fields)
Example #12
Source File: vcr.py From HGL-pytorch with MIT License | 5 votes |
def _fix_tokenization(tokenized_sent, bert_embs, old_det_to_new_ind, obj_to_type, token_indexers,
                      pad_ind=-1):
    """
    Turn a detection list into what we want: some text, as well as some tags.
    :param tokenized_sent: Tokenized sentence with detections collapsed to a list.
    :param old_det_to_new_ind: Mapping of the old ID -> new ID (which will be used as the tag)
    :param obj_to_type: [person, person, pottedplant] indexed by the old labels
    :return: tokenized sentence
    """
    new_tokenization_with_tags = []
    for tok in tokenized_sent:
        if isinstance(tok, list):
            for int_name in tok:
                obj_type = obj_to_type[int_name]
                new_ind = old_det_to_new_ind[int_name]
                if new_ind < 0:
                    raise ValueError("Oh no, the new index is negative! that means it's invalid. {} {}".format(
                        tokenized_sent, old_det_to_new_ind
                    ))
                text_to_use = GENDER_NEUTRAL_NAMES[
                    new_ind % len(GENDER_NEUTRAL_NAMES)] if obj_type == 'person' else obj_type
                new_tokenization_with_tags.append((text_to_use, new_ind))
        else:
            new_tokenization_with_tags.append((tok, pad_ind))

    text_field = BertField([Token(x[0]) for x in new_tokenization_with_tags],
                           bert_embs,
                           padding_value=0)
    tags = SequenceLabelField([x[1] for x in new_tokenization_with_tags], text_field)
    return text_field, tags
Example #13
Source File: ebmnlp.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], pico_tags: List[str] = None):
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Set the field 'labels' according to the specified PIO element
    if pico_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

    return Instance(instance_fields)
Example #14
Source File: relation_instances_reader.py From comb_dist_direct_relex with Apache License 2.0 | 5 votes |
def _tokens_distances_fields(self, tokens):
    """Returns the updated list of tokens and entity distances for the first and second entity as fields."""
    tokens, positions1, positions2 = self._tokens_distances(tokens)
    t_f = TextField(tokens, self._token_indexers)
    p1_f = SequenceLabelField(positions1, t_f)
    p2_f = SequenceLabelField(positions2, t_f)
    return t_f, p1_f, p2_f
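As in Examples #4 and #10, the "labels" here are lists of integers (entity distances) rather than strings. SequenceLabelField accepts pre-indexed integer labels and uses them as-is; only string labels are converted through the vocabulary when the instance is indexed.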
Example #15
Source File: sequence_tagging.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(  # type: ignore
    self, tokens: List[Token], tags: List[str] = None
) -> Instance:
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #16
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def setUp(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace(u"this", u"words")
    self.vocab.add_token_to_namespace(u"is", u"words")
    self.vocab.add_token_to_namespace(u"a", u"words")
    self.vocab.add_token_to_namespace(u"sentence", u'words')
    self.vocab.add_token_to_namespace(u"s", u'characters')
    self.vocab.add_token_to_namespace(u"e", u'characters')
    self.vocab.add_token_to_namespace(u"n", u'characters')
    self.vocab.add_token_to_namespace(u"t", u'characters')
    self.vocab.add_token_to_namespace(u"c", u'characters')
    for label in [u'a', u'b', u'c', u'd', u'e', u'f', u'g', u'h', u'i', u'j', u'k']:
        self.vocab.add_token_to_namespace(label, u'labels')

    self.word_indexer = {u"words": SingleIdTokenIndexer(u"words")}
    self.words_and_characters_indexers = {u"words": SingleIdTokenIndexer(u"words"),
                                          u"characters": TokenCharactersIndexer(u"characters")}
    self.field1 = TextField([Token(t) for t in [u"this", u"is", u"a", u"sentence"]],
                            self.word_indexer)
    self.field2 = TextField([Token(t) for t in [u"this", u"is", u"a", u"different", u"sentence"]],
                            self.word_indexer)
    self.field3 = TextField([Token(t) for t in [u"this", u"is", u"another", u"sentence"]],
                            self.word_indexer)
    self.empty_text_field = self.field1.empty_field()

    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    super(TestListField, self).setUp()
Example #17
Source File: universal_dependencies.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     words,
                     upos_tags,
                     dependencies=None):
    # pylint: disable=arguments-differ
    u"""
    Parameters
    ----------
    words : ``List[str]``, required.
        The words in the sentence to be encoded.
    upos_tags : ``List[str]``, required.
        The universal dependencies POS tags for each word.
    dependencies : ``List[Tuple[str, int]]``, optional (default = None)
        A list of (head tag, head index) tuples. Indices are 1 indexed,
        meaning an index of 0 corresponds to that word being the root of
        the dependency tree.

    Returns
    -------
    An instance containing words, upos tags, dependency head tags and head
    indices as fields.
    """
    fields = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields[u"words"] = tokens
    fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                  tokens,
                                                  label_namespace=u"head_tags")
        fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_index_tags")

    fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
    return Instance(fields)
Example #18
Source File: datareader.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None,
                     words: List[str] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    fields["metadata"] = MetadataField({"words": words})
    if tags is not None:
        labels, detect_tags, complex_flag_dict = self.extract_tags(tags)
        if self._skip_complex and complex_flag_dict[self._skip_complex] > 0:
            return None
        rnd = random()
        # skip TN
        if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
            if rnd > self._tn_prob:
                return None
        # skip TP
        else:
            if rnd > self._tp_prob:
                return None

        fields["labels"] = SequenceLabelField(labels, sequence, label_namespace="labels")
        fields["d_tags"] = SequenceLabelField(detect_tags, sequence, label_namespace="d_tags")
    return Instance(fields)
Example #19
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #20
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #21
Source File: input_reduction.py From allennlp with Apache License 2.0 | 5 votes |
def _get_ner_tags_and_mask(
    instance: Instance, input_field_to_attack: str, ignore_tokens: List[str]
):
    """
    Used for the NER task. Sets the num_ignore tokens, saves the original predicted tag
    and a 0/1 mask in the position of the tags
    """
    # Set num_ignore_tokens
    num_ignore_tokens = 0
    input_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token in input_field.tokens:
        if str(token) in ignore_tokens:
            num_ignore_tokens += 1

    # save the original tags and a 0/1 mask where the tags are
    tag_mask = []
    original_tags = []
    tag_field: SequenceLabelField = instance["tags"]  # type: ignore
    for label in tag_field.labels:
        if label != "O":
            tag_mask.append(1)
            original_tags.append(label)
            num_ignore_tokens += 1
        else:
            tag_mask.append(0)

    return num_ignore_tokens, tag_mask, original_tags
Example #22
Source File: conll2003.py From magnitude with MIT License | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     pos_tags=None,
                     chunk_tags=None,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})

    # Recode the labels if necessary.
    if self.coding_scheme == u"BIOUL":
        coded_chunks = to_bioul(chunk_tags) if chunk_tags is not None else None
        coded_ner = to_bioul(ner_tags) if ner_tags is not None else None
    else:
        # the default IOB1
        coded_chunks = chunk_tags
        coded_ner = ner_tags

    # Add "feature labels" to instance
    if u'pos' in self.feature_labels:
        if pos_tags is None:
            raise ConfigurationError(u"Dataset reader was specified to use pos_tags as "
                                     u"features. Pass them to text_to_instance.")
        instance_fields[u'pos_tags'] = SequenceLabelField(pos_tags, sequence, u"pos_tags")
    if u'chunk' in self.feature_labels:
        if coded_chunks is None:
            raise ConfigurationError(u"Dataset reader was specified to use chunk tags as "
                                     u"features. Pass them to text_to_instance.")
        instance_fields[u'chunk_tags'] = SequenceLabelField(coded_chunks, sequence, u"chunk_tags")
    if u'ner' in self.feature_labels:
        if coded_ner is None:
            raise ConfigurationError(u"Dataset reader was specified to use NER tags as "
                                     u" features. Pass them to text_to_instance.")
        instance_fields[u'ner_tags'] = SequenceLabelField(coded_ner, sequence, u"ner_tags")

    # Add "tag label" to instance
    if self.tag_label == u'ner' and coded_ner is not None:
        instance_fields[u'tags'] = SequenceLabelField(coded_ner, sequence)
    elif self.tag_label == u'pos' and pos_tags is not None:
        instance_fields[u'tags'] = SequenceLabelField(pos_tags, sequence)
    elif self.tag_label == u'chunk' and coded_chunks is not None:
        instance_fields[u'tags'] = SequenceLabelField(coded_chunks, sequence)

    return Instance(instance_fields)
Example #23
Source File: universal_dependencies.py From udify with MIT License | 4 votes |
def text_to_instance(self,  # type: ignore
                     words: List[str],
                     lemmas: List[str] = None,
                     lemma_rules: List[str] = None,
                     upos_tags: List[str] = None,
                     xpos_tags: List[str] = None,
                     feats: List[str] = None,
                     dependencies: List[Tuple[str, int]] = None,
                     ids: List[str] = None,
                     multiword_ids: List[str] = None,
                     multiword_forms: List[str] = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in words], self._token_indexers)
    fields["tokens"] = tokens

    names = ["upos", "xpos", "feats", "lemmas"]
    all_tags = [upos_tags, xpos_tags, feats, lemma_rules]
    for name, field in zip(names, all_tags):
        if field:
            fields[name] = SequenceLabelField(field, tokens, label_namespace=name)

    if dependencies is not None:
        # We don't want to expand the label namespace with an additional dummy token, so we'll
        # always give the 'ROOT_HEAD' token a label of 'root'.
        fields["head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                 tokens,
                                                 label_namespace="head_tags")
        fields["head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                    tokens,
                                                    label_namespace="head_index_tags")

    fields["metadata"] = MetadataField({
        "words": words,
        "upos_tags": upos_tags,
        "xpos_tags": xpos_tags,
        "feats": feats,
        "lemmas": lemmas,
        "lemma_rules": lemma_rules,
        "ids": ids,
        "multiword_ids": multiword_ids,
        "multiword_forms": multiword_forms
    })
    return Instance(fields)
Example #24
Source File: list_field_test.py From allennlp with Apache License 2.0 | 4 votes |
def setup_method(self):
    self.vocab = Vocabulary()
    self.vocab.add_token_to_namespace("this", "words")
    self.vocab.add_token_to_namespace("is", "words")
    self.vocab.add_token_to_namespace("a", "words")
    self.vocab.add_token_to_namespace("sentence", "words")
    self.vocab.add_token_to_namespace("s", "characters")
    self.vocab.add_token_to_namespace("e", "characters")
    self.vocab.add_token_to_namespace("n", "characters")
    self.vocab.add_token_to_namespace("t", "characters")
    self.vocab.add_token_to_namespace("c", "characters")
    for label in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]:
        self.vocab.add_token_to_namespace(label, "labels")

    self.word_indexer = {"words": SingleIdTokenIndexer("words")}
    self.words_and_characters_indexers = {
        "words": SingleIdTokenIndexer("words"),
        "characters": TokenCharactersIndexer("characters", min_padding_length=1),
    }
    self.field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence"]], self.word_indexer
    )
    self.field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence"]], self.word_indexer
    )
    self.field3 = TextField(
        [Token(t) for t in ["this", "is", "another", "sentence"]], self.word_indexer
    )
    self.empty_text_field = self.field1.empty_field()

    self.index_field = IndexField(1, self.field1)
    self.empty_index_field = self.index_field.empty_field()
    self.sequence_label_field = SequenceLabelField([1, 1, 0, 1], self.field1)
    self.empty_sequence_label_field = self.sequence_label_field.empty_field()

    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    empty_list_field = ListField([text_field.empty_field()])
    empty_fields = {"list_tensor": empty_list_field}
    self.empty_instance = Instance(empty_fields)

    non_empty_list_field = ListField([text_field])
    non_empty_fields = {"list_tensor": non_empty_list_field}
    self.non_empty_instance = Instance(non_empty_fields)

    super().setup_method()
Example #25
Source File: transition_eds_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     root_id: List[int] = None,
                     meta_info: List[str] = None,
                     concept_label: List[int] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     gold_mrps: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]

    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range

    if concept_label is not None:
        meta_dict["concept_label"] = concept_label
        fields["concept_label"] = TextField([Token(a) for a in concept_label], self._concept_label_indexers)

    if root_id is not None:
        meta_dict["root_id"] = root_id[0]

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #26
Source File: transition_sdp_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     mrp_pos_tags: List[str] = None,
                     arc_indices: List[Tuple[int, int]] = None,
                     arc_tags: List[str] = None,
                     gold_actions: List[str] = None,
                     meta_info: List[str] = None,
                     tokens_range: List[Tuple[int, int]] = None,
                     frame: List[str] = None,
                     pos_tag: List[str] = None,
                     node_label: List[str] = None,
                     gold_mrps: List[str] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)

    if mrp_pos_tags is not None:
        fields["mrp_pos_tags"] = SequenceLabelField(mrp_pos_tags, token_field, label_namespace="pos")
    if frame is not None:
        fields["frame"] = SequenceLabelField(frame, token_field, label_namespace="frame")
    if pos_tag is not None:
        fields["pos_tag"] = SequenceLabelField(pos_tag, token_field, label_namespace="pos_tag")
    if node_label is not None:
        fields["node_label"] = SequenceLabelField(node_label, token_field, label_namespace="node_label")

    if arc_indices is not None and arc_tags is not None:
        meta_dict["arc_indices"] = arc_indices
        meta_dict["arc_tags"] = arc_tags
        fields["arc_tags"] = TextField([Token(a) for a in arc_tags], self._arc_tag_indexers)

    if gold_actions is not None:
        meta_dict["gold_actions"] = gold_actions
        fields["gold_actions"] = TextField([Token(a) for a in gold_actions], self._action_indexers)

    if meta_info is not None:
        meta_dict["meta_info"] = meta_info[0]

    if tokens_range is not None:
        meta_dict["tokens_range"] = tokens_range

    if gold_mrps is not None:
        meta_dict["gold_mrps"] = gold_mrps[0]

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #27
Source File: transition_amr_reader.py From HIT-SCIR-CoNLL2019 with Apache License 2.0 | 4 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     lemmas: List[str] = None,
                     pos_tags: List[str] = None,
                     gold_actions: List[List[str]] = None,
                     id: str = None,
                     amr: str = None,
                     input: str = None,
                     mrp: str = None,
                     companion: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    token_field = TextField([Token(t) for t in tokens], self._token_indexers)
    fields["tokens"] = token_field
    meta_dict = {"tokens": tokens}

    if id:
        meta_dict["id"] = id
    if amr:
        meta_dict["amr"] = amr
    if input:
        meta_dict["input"] = input
    if mrp:
        meta_dict["mrp"] = json.loads(mrp)
    if companion:
        meta_dict["companion"] = json.loads(companion)

    if lemmas is not None and self._lemma_indexers is not None:
        fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
    if pos_tags is not None:
        fields["pos_tags"] = SequenceLabelField(pos_tags, token_field, label_namespace="pos")

    if gold_actions is not None:
        meta_dict["gold_actions"] = ['@@:@@'.join(a) for a in gold_actions]
        fields["gold_actions"] = TextField([Token('@@:@@'.join(a)) for a in gold_actions],
                                           {'actions': SingleIdTokenIndexer(namespace='actions')})
        fields["gold_newnodes"] = TextField(
            [Token(a[1] if a[0] == 'NEWNODE' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'newnodes': SingleIdTokenIndexer(namespace='newnodes')})
        fields["gold_entities"] = TextField(
            [Token(a[1] if a[0] == 'ENTITY' else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'entities': SingleIdTokenIndexer(namespace='entities')})
        fields["gold_relations"] = TextField(
            [Token(a[1] if a[0] in ['LEFT', 'RIGHT'] else DEFAULT_PADDING_TOKEN) for a in gold_actions],
            {'relations': SingleIdTokenIndexer(namespace='relations')})

    fields["metadata"] = MetadataField(meta_dict)
    return Instance(fields)
Example #28
Source File: input_reduction.py From allennlp with Apache License 2.0 | 4 votes |
def _remove_one_token(
    instance: Instance,
    input_field_to_attack: str,
    grads: np.ndarray,
    ignore_tokens: List[str],
    beam_size: int,
    tag_mask: List[int],
) -> List[Tuple[Instance, int, List[int]]]:
    """
    Finds the token with the smallest gradient and removes it.
    """
    # Compute L2 norm of all grads.
    grads_mag = [np.sqrt(grad.dot(grad)) for grad in grads]

    # Skip all ignore_tokens by setting grad to infinity
    text_field: TextField = instance[input_field_to_attack]  # type: ignore
    for token_idx, token in enumerate(text_field.tokens):
        if token in ignore_tokens:
            grads_mag[token_idx] = float("inf")

    # For NER, skip all tokens that are not in outside
    if "tags" in instance:
        tag_field: SequenceLabelField = instance["tags"]  # type: ignore
        labels: List[str] = tag_field.labels  # type: ignore
        for idx, label in enumerate(labels):
            if label != "O":
                grads_mag[idx] = float("inf")

    reduced_instances_and_smallest: List[Tuple[Instance, int, List[int]]] = []
    for _ in range(beam_size):
        # copy instance and edit later
        copied_instance = deepcopy(instance)
        copied_text_field: TextField = copied_instance[input_field_to_attack]  # type: ignore

        # find smallest
        smallest = np.argmin(grads_mag)
        if grads_mag[smallest] == float("inf"):  # if all are ignored tokens, return.
            break
        grads_mag[smallest] = float("inf")  # so the other beams don't use this token

        # remove smallest
        inputs_before_smallest = copied_text_field.tokens[0:smallest]
        inputs_after_smallest = copied_text_field.tokens[smallest + 1 :]
        copied_text_field.tokens = inputs_before_smallest + inputs_after_smallest

        if "tags" in instance:
            tag_field: SequenceLabelField = copied_instance["tags"]  # type: ignore
            tag_field_before_smallest = tag_field.labels[0:smallest]
            tag_field_after_smallest = tag_field.labels[smallest + 1 :]
            tag_field.labels = tag_field_before_smallest + tag_field_after_smallest  # type: ignore
            tag_field.sequence_field = copied_text_field

        copied_instance.indexed = False
        reduced_instances_and_smallest.append((copied_instance, smallest, tag_mask))

    return reduced_instances_and_smallest