Python allennlp.data.fields.LabelField() Examples

The following are 30 code examples of allennlp.data.fields.LabelField(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
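LabelField wraps a single categorical label in an instance. A string label is mapped to an integer id through a Vocabulary namespace ("labels" by default; AllenNLP warns if a custom namespace does not end in "labels" or "tags"), while a label that is already an integer id can skip the vocabulary with skip_indexing=True. As a minimal, self-contained sketch of that lifecycle (illustrative only, not taken from any of the projects below):

from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField

# String label: indexed against the "labels" namespace of a Vocabulary.
label = LabelField("positive", label_namespace="labels")
vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace="labels")
vocab.add_token_to_namespace("negative", namespace="labels")
label.index(vocab)
print(label.as_tensor(label.get_padding_lengths()))  # tensor(0)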
Example #1
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         venue: str = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #2
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #3
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
        fields['tokens'] = token_field
        fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
        fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

        return Instance(fields) 
Example #4
Source File: snli.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        if label:
            fields['label'] = LabelField(label)

        metadata = {"premise_tokens": [x.text for x in premise_tokens],
                    "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #5
Source File: template_text2sql.py    From allennlp-semparse with Apache License 2.0
def text_to_instance(
        self,  # type: ignore
        query: List[str],
        slot_tags: List[str] = None,
        sql_template: str = None,
    ) -> Instance:
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if slot_tags is not None and sql_template is not None:
            slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
            template = LabelField(sql_template, label_namespace="template_labels")
            fields["slot_tags"] = slot_field
            fields["template"] = template

        return Instance(fields) 
Example #6
Source File: entailment_pair.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [Token(token.text)
                          for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
        hypothesis_tokens = [Token(token.text)
                             for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['label'] = LabelField(label)

        # metadata = {"premise_tokens": [x.text for x in premise_tokens],
        #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        # fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #7
Source File: instance_test.py    From allennlp with Apache License 2.0
def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance(
            {
                "words": TextField(
                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
                )
            }
        )

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method. 
Example #8
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         cleaned_cite_text: str = None,
                         section_name: str = None,
                         is_citation: bool = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)
        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if is_citation is not None:
            fields['is_citation'] = LabelField(str(is_citation), label_namespace="cite_worthiness_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #9
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")

        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #10
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
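In the swag-style readers above, the gold answer is already an integer index into the numbered hypothesis fields, so it is stored with skip_indexing=True and never touches a vocabulary. A short illustrative sketch (not part of the swagaf code) of what that flag does:

from allennlp.data.fields import LabelField

# An integer label with skip_indexing=True is stored as-is; passing a
# non-integer in this mode raises a ConfigurationError at construction time.
label = LabelField(3, skip_indexing=True)
print(label.as_tensor(label.get_padding_lengths()))  # tensor(3)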
Example #11
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # premise_tokens = self._tokenizer.tokenize(premise)
        # fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
Example #12
Source File: arc_multichoice_json_reader.py    From ARC-Solvers with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
           "id": item_id,
           "question_text": question_text,
           "choice_text_list": choice_text_list,
           "question_tokens": [x.text for x in question_tokens],
           "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #13
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #14
Source File: entailment_tuple_reader.py    From ARC-Solvers with Apache License 2.0
def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         hypothesis_structure: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': [token.text for token in premise_tokens],
            'hypothesis_tokens': [token.text for token in hypothesis_tokens]
        }
        fields['metadata'] = MetadataField(metadata)
        self._add_structure_to_fields(hypothesis_structure, fields)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #15
Source File: data_loading.py    From teaching with GNU General Public License v3.0
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field}) 
Example #16
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
Example #17
Source File: sent_sim_data.py    From glyce with Apache License 2.0
def text_to_instance(self, s1: str, s2: str, label: str = None, split_token: str = None) -> Instance:  # type: ignore
        tokens1 = self._tokenizer.tokenize(s1)
        tokens2 = self._tokenizer.tokenize(s2)
        # self._truncate_seq_pair(tokens1, tokens2, self.max_tokens)
        tokens1_field = TextField(tokens1, self._token_indexers)
        tokens2_field = TextField(tokens2, self._token_indexers)
        fields = {'premise': tokens1_field, 'hypothesis': tokens2_field}
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #18
Source File: list_field_test.py    From magnitude with MIT License
def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
        nested_field2 = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {u'num_fields': 3, u'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]]) 
Example #19
Source File: bert_pretraining_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                        sentence1: str,
                        sentence2: str,
                        next_sentence_label: int):

        fields = self._tokenizer_masker.tokenize_candidates_mask(sentence1, sentence2)

        # NSP label field
        fields['next_sentence_label'] = \
            LabelField(next_sentence_label, skip_indexing=True)

        return Instance(fields) 
Example #20
Source File: tacred_dataset_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                         sentence: Iterable[str],
                         relation: str,
                         subj_start: int,
                         subj_end: int,
                         obj_start: int,
                         obj_end: int) -> Instance:
        """
        Following approach in:
            https://openreview.net/forum?id=BJgrxbqp67
        We modify the input to look like:
            [CLS] subj [SEP] obj [SEP] sentence [SEP]
        """
        token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence)

        if self.entity_masking == 'type/role/segment':
            offsets = [1] + token_candidates['offsets_a'][:-1]
            segment_ids = list(token_candidates['segment_ids'])
            for s, e, ii in [[subj_start, subj_end+1, 1], [obj_start, obj_end+1, 2]]:
                ll = offsets[e] - offsets[s]
                segment_ids[offsets[s]:offsets[e]] = [ii] * ll
            # the type + [SEP]
            segment_ids[-4:-2] = [1, 1]
            segment_ids[-2:] = [2, 2]
            token_candidates['segment_ids'] = segment_ids

        # get the indices of the entity starts
        offsets = [1] + token_candidates['offsets_a'][:-1]
        idx1_offset = offsets[subj_start]
        idx2_offset = offsets[obj_start]

        fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        fields['label_ids'] = LabelField(LABEL_MAP[relation], skip_indexing=True)
        fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)
        fields['index_b'] = LabelField(idx2_offset, skip_indexing=True)

        return Instance(fields) 
Example #21
Source File: ultra_fine_reader.py    From kb with Apache License 2.0
def text_to_instance(self, sentence, span, labels, index_entity_start):
        token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence, span)
        fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        fields['label_ids'] = ArrayField(np.array(labels), dtype=np.int)

        # index of entity start
        if index_entity_start is not None:
            offsets = [1] + token_candidates['offsets_a'][:-1]
            idx1_offset = offsets[index_entity_start]
            fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)

        return Instance(fields) 
Example #22
Source File: entity_linking.py    From kb with Apache License 2.0
def _combine_instances(self, instance_a, instance_b, nsp_label, gold_cache):
        text_a = ' '.join([t.text for t in instance_a['tokens'].tokens])
        text_b = ' '.join([t.text for t in instance_b['tokens'].tokens])

        fields = self.tokenizer_and_masker.tokenize_candidates_mask(text_a, text_b)
        candidate_spans = [
            [s.span_start, s.span_end]
            for s in fields['candidates'].field_dict[self.id_type].field_dict['candidate_spans'].field_list
        ]
        assert sorted(candidate_spans) == candidate_spans

        # combine the gold entities
        golds = []
        for text in [text_a, text_b]:
            golds.append(gold_cache[text])

        combined_golds = []
        j = [-1, -1]
        for span in candidate_spans:
            i = fields['segment_ids'].array[span[0]]
            j[i] += 1
            combined_golds.append(golds[i][j[i]])

        gold_text_field = TextField(
            [Token(g) for g in combined_golds],
            token_indexers=self.entity_indexer
        )
        fields['gold_entities'] = DictField({self.id_type: gold_text_field})

        if self.use_nsp_label:
            fields['next_sentence_label'] = LabelField(nsp_label, skip_indexing=True)

        del fields['lm_label_ids']

        return Instance(fields) 
Example #23
Source File: cls_data.py    From glyce with Apache License 2.0
def text_to_instance(self, sentence: str, label: str) -> Instance:
        sentence_tokens = self._tokenizer.tokenize(sentence)
        if len(sentence) > self.max_sentence_length:
            sentence_tokens = sentence_tokens[:self.max_sentence_length]
            self.trimmed_count += 1
        sentence_field = TextField(sentence_tokens, self._token_indexers)
        fields = {'sentence': sentence_field}
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #24
Source File: ir_triple_loader.py    From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
        #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
        #doc_pos_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

        doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

        doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
        #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
        #doc_neg_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

        doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
        doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

        return Instance({
            "query_tokens":query_field,
            "doc_pos_tokens":doc_pos_field,
            "doc_neg_tokens": doc_neg_field,
            "query_length":query_length,
            "doc_pos_length":doc_pos_length,
            "doc_neg_length":doc_neg_length}) 
Example #25
Source File: ir_labeled_tuple_loader.py    From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field,
            "query_length":query_length,
            "doc_length":doc_length}) 
Example #26
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def _read(self, file_path):
        # A training instance consists of the word frequencies for the entire review and a
        # ``words_per_instance`` portion of the review.
        file_path = cached_path(file_path)

        # Partition each review into BPTT Limit + 1 chunks to allow room for input (chunk[:-1])
        # and output (chunk[1:]).
        # Break up the text into a series of BPTT chunks and yield one at a time.
        num_tokens = self._words_per_instance + 1
        with open(cached_path(file_path), 'r') as data_file:
            logger.info("Reading instances from lines in file: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                example = ujson.loads(line)
                example_text = example['text']
                example_text_tokenized = self._tokenizer.tokenize(example_text)
                example_sentiment = "positive" if example['sentiment'] >= 5 else "negative"
                example_sentiment_field = LabelField(example_sentiment)

                # Each review will receive the entire encoded review.
                frequency_field = TextField(example_text_tokenized, self._token_indexers)
                tokenized_strings = []
                for index in range(0, len(example_text_tokenized) - num_tokens, num_tokens - 1):
                    tokenized_strings.append(example_text_tokenized[index:(index + num_tokens)])

                    # By breaking early when training a classifier, we prevent training on duplicates.
                    if self._classification_mode:
                        break

                for tokenized_string in tokenized_strings:
                    input_field = TextField(tokenized_string[:-1], self._token_indexers)
                    output_field = TextField(tokenized_string[1:], self._token_indexers)
                    yield Instance({'input_tokens': input_field,
                                    'output_tokens': output_field,
                                    'frequency_tokens': frequency_field,
                                    'sentiment': example_sentiment_field}) 
Example #27
Source File: bert_reader_sent_selection.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = MetadataField(text1_span)
        fields['bert_s2_span'] = MetadataField(text2_span)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #28
Source File: bert_fever_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #29
Source File: entailment_tuple_reader.py    From ARC-Solvers with Apache License 2.0
def _add_structure_to_fields(self, structure, fields) -> None:
        """
        Add structure (nodes and edges) to the instance fields. Specifically, convert
        "plants<>produce<>oxygen" into ("produce", subj, "plants"), ("produce", obj, "oxygen"),
        ("plants", subj-obj, "oxygen"). Each quoted string forms a node represented using a
        TextField. Each source and target node in an edge is represented using IndexField into
        the list of nodes and the edge label is represented using a LabelField with "edges"
        namespace.
        """
        # take the last tuples
        tuples = structure.split("$$$")[-self._max_tuples:]
        node_list, edge_list = self._extract_nodes_and_edges_from_tuples(tuples)
        if not len(node_list):
            print("No nodes in {} for premise:{} and hypothesis: {}".format(
                structure, fields['metadata'].metadata["premise"],
                fields['metadata'].metadata["hypothesis"]))
        nodes_field = ListField(node_list)
        edge_source_list = []
        edge_target_list = []
        edge_label_list = []
        for edge in edge_list:
            source_field = IndexField(edge[0], nodes_field)
            target_field = IndexField(edge[2], nodes_field)
            label_field = LabelField(edge[1], "edges")
            edge_source_list.append(source_field)
            edge_target_list.append(target_field)
            edge_label_list.append(label_field)
        fields['nodes'] = nodes_field
        # Currently AllenNLP doesn't allow for ListFields containing ListFields,
        # so creating separate ListFields for source, target and labels for the edges
        fields['edge_sources'] = ListField(edge_source_list)
        fields['edge_targets'] = ListField(edge_target_list)
        fields['edge_labels'] = ListField(edge_label_list) 
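The docstring above describes how a tuple string such as "plants<>produce<>oxygen" becomes a list of nodes plus labeled edges, with each edge label placed in the "edges" namespace. A rough, hypothetical sketch of that mapping (the real extraction happens in _extract_nodes_and_edges_from_tuples, which is not shown here):

from allennlp.data.fields import LabelField

# Hypothetical, simplified illustration of the docstring, not the ARC-Solvers helper itself.
structure = "plants<>produce<>oxygen"
subj, verb, obj = structure.split("<>")
nodes = [subj, verb, obj]                                    # wrapped in TextFields by the real reader
edges = [(1, "subj", 0), (1, "obj", 2), (0, "subj-obj", 2)]  # (source, label, target) node indices
edge_label_fields = [LabelField(label, "edges") for _, label, _ in edges]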
Example #30
Source File: semisupervised_text_classification_json.py    From vampire with Apache License 2.0
def text_to_instance(self, text: str, label: str = None) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify
        label : ``str``, optional (default = None).
            The label for this text.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize(text)
        if self._max_sequence_length is not None:
            tokens = self._truncate(tokens)
        fields['tokens'] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields['label'] = LabelField(label,
                                         skip_indexing=self._skip_label_indexing)
        return Instance(fields)