Python allennlp.data.fields.LabelField() Examples
The following are 30 code examples of allennlp.data.fields.LabelField(), drawn from open-source projects. The original project and source file for each example are noted above it. You may also want to check out all available functions and classes of the allennlp.data.fields module.
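Before the examples, here is a minimal, self-contained sketch of how a LabelField is typically constructed, indexed against a Vocabulary, and turned into an id. It assumes an AllenNLP-style API (roughly versions 0.9–2.x); the field names and the toy label are illustrative only.

# Minimal sketch of LabelField usage (assumes an AllenNLP 0.9-2.x style API; names are illustrative).
from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

tokens = [Token("a"), Token("toy"), Token("sentence")]
text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})

# A string label is mapped to an integer id through the "labels" vocabulary namespace.
label_field = LabelField("positive", label_namespace="labels")

# An integer label can bypass vocabulary indexing entirely.
index_field = LabelField(3, skip_indexing=True)

instance = Instance({"tokens": text_field, "label": label_field, "choice_index": index_field})
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(vocab.get_token_index("positive", namespace="labels"))  # integer id assigned to the label

Labels built with skip_indexing=True (as in several of the retrieval and multiple-choice examples below) carry a raw integer through batching instead of a vocabulary id.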
Example #1
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 6 votes |
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     venue: str = None,
                     section_name: str = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if section_name is not None:
        fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #2
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self,
                     tokens: List[Token],
                     tags: List[str] = None,
                     domain: str = None,
                     intent: str = None,
                     dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
                                            'dialog_act': {}})
    return Instance(fields)
Example #3
Source File: prolocal_dataset_reader.py From propara with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     sentence_tokens: List[str],
                     verb_vector: List[int],
                     entity_vector: List[int],
                     state_change_types: Optional[List[str]] = None,
                     state_change_tags: Optional[List[str]] = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    # encode inputs
    token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
    fields['tokens'] = token_field
    fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
    fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

    # encode outputs
    if state_change_types:
        fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
    if state_change_tags:
        fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

    return Instance(fields)
Example #4
Source File: snli.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)
    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #5
Source File: template_text2sql.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def text_to_instance(
    self,  # type: ignore
    query: List[str],
    slot_tags: List[str] = None,
    sql_template: str = None,
) -> Instance:
    fields: Dict[str, Field] = {}
    tokens = TextField([Token(t) for t in query], self._token_indexers)
    fields["tokens"] = tokens

    if slot_tags is not None and sql_template is not None:
        slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
        template = LabelField(sql_template, label_namespace="template_labels")
        fields["slot_tags"] = slot_field
        fields["template"] = template

    return Instance(fields)
Example #6
Source File: entailment_pair.py From multee with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premise: str,
                     hypothesis: str,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = [Token(token.text)
                      for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
    hypothesis_tokens = [Token(token.text)
                         for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]
    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields['label'] = LabelField(label)
    # metadata = {"premise_tokens": [x.text for x in premise_tokens],
    #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    # fields["metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #7
Source File: instance_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_duplicate(self):
    # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
    # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
    instance = Instance(
        {
            "words": TextField(
                [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
            )
        }
    )
    other = instance.duplicate()
    assert other == instance

    # Adding new fields to the original instance should not affect the duplicate.
    instance.add_field("labels", LabelField("some_label"))
    assert "labels" not in other.fields
    assert other != instance  # sanity check on the '__eq__' method.
Example #8
Source File: citation_data_reader_aclarc_aux.py From scicite with Apache License 2.0 | 6 votes |
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     cleaned_cite_text: str = None,
                     section_name: str = None,
                     is_citation: bool = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if is_citation is not None:
        fields['is_citation'] = LabelField(str(is_citation), label_namespace="cite_worthiness_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #9
Source File: citation_data_reader_scicite_aux.py From scicite with Apache License 2.0 | 6 votes |
def text_to_instance(self,
                     citation_text: str,
                     citing_paper_id: str,
                     cited_paper_id: str,
                     intent: List[str] = None,
                     section_name: str = None) -> Instance:
    citation_tokens = self._tokenizer.tokenize(citation_text)
    fields = {
        'citation_text': TextField(citation_tokens, self._token_indexers),
    }
    if section_name is not None:
        fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
    fields['citing_paper_id'] = MetadataField(citing_paper_id)
    fields['cited_paper_id'] = MetadataField(cited_paper_id)
    return Instance(fields)
Example #10
Source File: dataset_reader.py From swagaf with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypotheses: List[str],
                     label: int = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)

    # This could be another way to get randomness
    for i, hyp in enumerate(hypotheses):
        hypothesis_tokens = self._tokenizer.tokenize(hyp)
        fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    return Instance(fields)
Example #11
Source File: dataset_reader.py From swagaf with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypotheses: List[str],
                     label: int = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # premise_tokens = self._tokenizer.tokenize(premise)
    # fields['premise'] = TextField(premise_tokens, self._token_indexers)

    # This could be another way to get randomness
    for i, hyp in enumerate(hypotheses):
        hypothesis_tokens = self._tokenizer.tokenize(hyp)
        fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    return Instance(fields)
Example #12
Source File: arc_multichoice_json_reader.py From ARC-Solvers with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    question_tokens = self._tokenizer.tokenize(question_text)
    choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
    fields['question'] = TextField(question_tokens, self._token_indexers)
    fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [x.text for x in question_tokens],
        "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
    }
    fields["metadata"] = MetadataField(metadata)

    return Instance(fields)
Example #13
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #14
Source File: entailment_tuple_reader.py From ARC-Solvers with Apache License 2.0 | 6 votes |
def text_to_instance(self,
                     premise: str,
                     hypothesis: str,
                     hypothesis_structure: str,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]

    fields['premise'] = TextField(premise_tokens, self._token_indexers)
    fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    metadata = {
        'premise': premise,
        'hypothesis': hypothesis,
        'premise_tokens': [token.text for token in premise_tokens],
        'hypothesis_tokens': [token.text for token in hypothesis_tokens]
    }
    fields['metadata'] = MetadataField(metadata)
    self._add_structure_to_fields(hypothesis_structure, fields)
    if label:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #15
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]
    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]
    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #16
Source File: dataset_reader.py From swagaf with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise: str,
                     hypotheses: List[str],
                     label: int = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    fields['premise'] = TextField(premise_tokens, self._token_indexers)

    # This could be another way to get randomness
    for i, hyp in enumerate(hypotheses):
        hypothesis_tokens = self._tokenizer.tokenize(hyp)
        fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=True)
    return Instance(fields)
Example #17
Source File: sent_sim_data.py From glyce with Apache License 2.0 | 5 votes |
def text_to_instance(self, s1: str, s2: str, label: str = None, split_token: str = None) -> Instance:  # type: ignore
    tokens1 = self._tokenizer.tokenize(s1)
    tokens2 = self._tokenizer.tokenize(s2)
    # self._truncate_seq_pair(tokens1, tokens2, self.max_tokens)
    tokens1_field = TextField(tokens1, self._token_indexers)
    tokens2_field = TextField(tokens2, self._token_indexers)
    fields = {'premise': tokens1_field, 'hypothesis': tokens2_field}
    if label is not None:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #18
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_nested_list_fields_are_padded_correctly(self):
    nested_field1 = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
    nested_field2 = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])

    list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    assert padding_lengths == {u'num_fields': 3, u'list_num_fields': 6}

    tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
    numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                               [0, 1, 2, 3, 4, -1],
                                               [5, 6, 7, 8, 9, 10]])
Example #19
Source File: bert_pretraining_reader.py From kb with Apache License 2.0 | 5 votes |
def text_to_instance(self, sentence1: str, sentence2: str, next_sentence_label: int):
    fields = self._tokenizer_masker.tokenize_candidates_mask(sentence1, sentence2)

    # NSP label field
    fields['next_sentence_label'] = \
        LabelField(next_sentence_label, skip_indexing=True)

    return Instance(fields)
Example #20
Source File: tacred_dataset_reader.py From kb with Apache License 2.0 | 5 votes |
def text_to_instance(self,
                     sentence: Iterable[str],
                     relation: str,
                     subj_start: int,
                     subj_end: int,
                     obj_start: int,
                     obj_end: int) -> Instance:
    """
    Following approach in: https://openreview.net/forum?id=BJgrxbqp67

    We modify the input to look like:
        [CLS] subj [SEP] obj [SEP] sentence [SEP]
    """
    token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence)

    if self.entity_masking == 'type/role/segment':
        offsets = [1] + token_candidates['offsets_a'][:-1]
        segment_ids = list(token_candidates['segment_ids'])
        for s, e, ii in [[subj_start, subj_end+1, 1], [obj_start, obj_end+1, 2]]:
            ll = offsets[e] - offsets[s]
            segment_ids[offsets[s]:offsets[e]] = [ii] * ll
        # the type + [SEP]
        segment_ids[-4:-2] = [1, 1]
        segment_ids[-2:] = [2, 2]
        token_candidates['segment_ids'] = segment_ids

    # get the indices of the entity starts
    offsets = [1] + token_candidates['offsets_a'][:-1]
    idx1_offset = offsets[subj_start]
    idx2_offset = offsets[obj_start]

    fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
    fields['label_ids'] = LabelField(LABEL_MAP[relation], skip_indexing=True)
    fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)
    fields['index_b'] = LabelField(idx2_offset, skip_indexing=True)

    return Instance(fields)
Example #21
Source File: ultra_fine_reader.py From kb with Apache License 2.0 | 5 votes |
def text_to_instance(self, sentence, span, labels, index_entity_start):
    token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence, span)
    fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
    fields['label_ids'] = ArrayField(np.array(labels), dtype=np.int)

    # index of entity start
    if index_entity_start is not None:
        offsets = [1] + token_candidates['offsets_a'][:-1]
        idx1_offset = offsets[index_entity_start]
        fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)

    return Instance(fields)
Example #22
Source File: entity_linking.py From kb with Apache License 2.0 | 5 votes |
def _combine_instances(self, instance_a, instance_b, nsp_label, gold_cache):
    text_a = ' '.join([t.text for t in instance_a['tokens'].tokens])
    text_b = ' '.join([t.text for t in instance_b['tokens'].tokens])

    fields = self.tokenizer_and_masker.tokenize_candidates_mask(text_a, text_b)

    candidate_spans = [
        [s.span_start, s.span_end] for s in
        fields['candidates'].field_dict[self.id_type].field_dict['candidate_spans'].field_list
    ]
    assert sorted(candidate_spans) == candidate_spans

    # combine the gold entities
    golds = []
    for text in [text_a, text_b]:
        golds.append(gold_cache[text])

    combined_golds = []
    j = [-1, -1]
    for span in candidate_spans:
        i = fields['segment_ids'].array[span[0]]
        j[i] += 1
        combined_golds.append(golds[i][j[i]])

    gold_text_field = TextField(
        [Token(g) for g in combined_golds],
        token_indexers=self.entity_indexer
    )
    fields['gold_entities'] = DictField({self.id_type: gold_text_field})

    if self.use_nsp_label:
        fields['next_sentence_label'] = LabelField(nsp_label, skip_indexing=True)

    del fields['lm_label_ids']

    return Instance(fields)
Example #23
Source File: cls_data.py From glyce with Apache License 2.0 | 5 votes |
def text_to_instance(self, sentence: str, label: str) -> Instance:
    sentence_tokens = self._tokenizer.tokenize(sentence)
    if len(sentence) > self.max_sentence_length:
        sentence_tokens = sentence_tokens[:self.max_sentence_length]
        self.trimmed_count += 1
    sentence_field = TextField(sentence_tokens, self._token_indexers)
    fields = {'sentence': sentence_field}
    if label is not None:
        fields['label'] = LabelField(label)
    return Instance(fields)
Example #24
Source File: ir_triple_loader.py From sigir19-neural-ir with Apache License 2.0 | 5 votes |
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    # doc_pos_tokenized.insert(0, Token(START_SYMBOL))
    # doc_pos_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    # doc_neg_tokenized.insert(0, Token(START_SYMBOL))
    # doc_neg_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
    doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field,
        "query_length": query_length,
        "doc_pos_length": doc_pos_length,
        "doc_neg_length": doc_neg_length})
Example #25
Source File: ir_labeled_tuple_loader.py From sigir19-neural-ir with Apache License 2.0 | 5 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ
    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    # if self._source_add_start_token:
    #     query_tokenized.insert(0, Token(START_SYMBOL))
    # query_tokenized.append(Token(END_SYMBOL))
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    # doc_tokenized.insert(0, Token(START_SYMBOL))
    # doc_tokenized.append(Token(END_SYMBOL))
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    query_length = LabelField(len(query_tokenized), skip_indexing=True)
    doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field,
        "query_length": query_length,
        "doc_length": doc_length})
Example #26
Source File: imdb_review_reader.py From topic-rnn with Apache License 2.0 | 5 votes |
def _read(self, file_path):
    # A training instance consists of the word frequencies for the entire review and a
    # `words_per_instance` portion of the review.
    file_path = cached_path(file_path)

    # Partition each review into BPTT Limit + 1 chunks to allow room for input (chunk[:-1])
    # and output (chunk[1:]).
    # Break up the text into a series of BPTT chunks and yield one at a time.
    num_tokens = self._words_per_instance + 1

    with open(cached_path(file_path), 'r') as data_file:
        logger.info("Reading instances from lines in file: %s", file_path)
        for line in data_file:
            line = line.strip("\n")
            if not line:
                continue

            example = ujson.loads(line)
            example_text = example['text']
            example_text_tokenized = self._tokenizer.tokenize(example_text)
            example_sentiment = "positive" if example['sentiment'] >= 5 else "negative"
            example_sentiment_field = LabelField(example_sentiment)

            # Each review will receive the entire encoded review.
            frequency_field = TextField(example_text_tokenized, self._token_indexers)

            tokenized_strings = []
            for index in range(0, len(example_text_tokenized) - num_tokens, num_tokens - 1):
                tokenized_strings.append(example_text_tokenized[index:(index + num_tokens)])

                # By breaking early when training a classifier, we prevent training on duplicates.
                if self._classification_mode:
                    break

            for tokenized_string in tokenized_strings:
                input_field = TextField(tokenized_string[:-1], self._token_indexers)
                output_field = TextField(tokenized_string[1:], self._token_indexers)
                yield Instance({'input_tokens': input_field,
                                'output_tokens': output_field,
                                'frequency_tokens': frequency_field,
                                'sentiment': example_sentiment_field})
Example #27
Source File: bert_reader_sent_selection.py From semanticRetrievalMRS with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     sent1: str,  # Important type information
                     sent2: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
    tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.query_l]
    tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

    joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
    text1_len = len(tokenized_text1) + 2
    text2_len = len(tokenized_text2) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

    fields['bert_s1_span'] = MetadataField(text1_span)
    fields['bert_s2_span'] = MetadataField(text2_span)

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
Example #28
Source File: bert_fever_reader.py From semanticRetrievalMRS with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     sent1: str,  # Important type information
                     sent2: str,
                     pid: str = None,
                     label: str = None) -> Instance:
    fields: Dict[str, Field] = {}

    tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
    tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

    # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
    tokenized_text1 = tokenized_text1[:self.s1_l]
    tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

    joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
    text1_len = len(tokenized_text1) + 2
    text2_len = len(tokenized_text2) + 1
    segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

    joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
    assert len(joint_tokens_ids) == len(segments_ids)

    fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
    fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

    text1_span = (1, 1 + len(tokenized_text1))  # End is exclusive (important for later use)
    text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

    fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
    fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

    if label:
        fields['label'] = LabelField(label, label_namespace='labels')

    if pid:
        fields['pid'] = IdField(pid)

    return Instance(fields)
Example #29
Source File: entailment_tuple_reader.py From ARC-Solvers with Apache License 2.0 | 5 votes |
def _add_structure_to_fields(self, structure, fields) -> None:
    """
    Add structure (nodes and edges) to the instance fields. Specifically, convert
    "plants<>produce<>oxygen" into ("produce", subj, "plants"), ("produce", obj, "oxygen"),
    ("plants", subj-obj, "oxygen"). Each quoted string forms a node represented using a
    TextField. Each source and target node in an edge is represented using an IndexField
    into the list of nodes, and the edge label is represented using a LabelField with the
    "edges" namespace.
    """
    # take the last tuples
    tuples = structure.split("$$$")[-self._max_tuples:]
    node_list, edge_list = self._extract_nodes_and_edges_from_tuples(tuples)
    if not len(node_list):
        print("No nodes in {} for premise:{} and hypothesis: {}".format(
            structure,
            fields['metadata'].metadata["premise"],
            fields['metadata'].metadata["hypothesis"]))
    nodes_field = ListField(node_list)

    edge_source_list = []
    edge_target_list = []
    edge_label_list = []
    for edge in edge_list:
        source_field = IndexField(edge[0], nodes_field)
        target_field = IndexField(edge[2], nodes_field)
        label_field = LabelField(edge[1], "edges")
        edge_source_list.append(source_field)
        edge_target_list.append(target_field)
        edge_label_list.append(label_field)

    fields['nodes'] = nodes_field
    # Currently AllenNLP doesn't allow for ListFields containing ListFields,
    # so creating separate ListFields for source, target and labels for the edges
    fields['edge_sources'] = ListField(edge_source_list)
    fields['edge_targets'] = ListField(edge_target_list)
    fields['edge_labels'] = ListField(edge_label_list)
Example #30
Source File: semisupervised_text_classification_json.py From vampire with Apache License 2.0 | 5 votes |
def text_to_instance(self, text: str, label: str = None) -> Instance:  # type: ignore
    """
    Parameters
    ----------
    text : ``str``, required.
        The text to classify
    label : ``str``, optional, (default = None).
        The label for this text.

    Returns
    -------
    An ``Instance`` containing the following fields:
        tokens : ``TextField``
            The tokens in the sentence or phrase.
        label : ``LabelField``
            The label of the sentence or phrase.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    tokens = self._tokenizer.tokenize(text)
    if self._max_sequence_length is not None:
        tokens = self._truncate(tokens)
    fields['tokens'] = TextField(tokens, self._token_indexers)
    if label is not None:
        fields['label'] = LabelField(label, skip_indexing=self._skip_label_indexing)
    return Instance(fields)