Python allennlp.data.fields.LabelField() Examples

The following are 30 code examples of allennlp.data.fields.LabelField(). You can go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
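LabelField wraps a single categorical label in an instance. A string label is mapped to an integer id through a Vocabulary namespace ("labels" by default; AllenNLP warns if a custom namespace does not end in "labels" or "tags"), while a label that is already an integer id can skip the vocabulary with skip_indexing=True. As a minimal, self-contained sketch of that lifecycle (illustrative only, not taken from any of the projects below):

from allennlp.data import Vocabulary
from allennlp.data.fields import LabelField

# String label: indexed against the "labels" namespace of a Vocabulary.
label = LabelField("positive", label_namespace="labels")
vocab = Vocabulary()
vocab.add_token_to_namespace("positive", namespace="labels")
vocab.add_token_to_namespace("negative", namespace="labels")
label.index(vocab)
print(label.as_tensor(label.get_padding_lengths()))  # tensor(0)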
Example #1
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         venue: str = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #2
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        sequence = TextField(tokens, self._token_indexers)
        fields["tokens"] = sequence
        if tags:
            fields["tags"] = SequenceLabelField(tags, sequence)
        if domain:
            fields["domain"] = LabelField(domain, label_namespace="domain_labels")
        if intent:
            fields["intent"] = LabelField(intent, label_namespace="intent_labels")
        if dialog_act is not None:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens],
            'dialog_act': dialog_act})
        else:
            fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
        return Instance(fields) 
Example #3
Source File: prolocal_dataset_reader.py    From propara with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         sentence_tokens: List[str],
                         verb_vector: List[int],
                         entity_vector: List[int],
                         state_change_types: Optional[List[str]] = None,
                         state_change_tags: Optional[List[str]] = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        # encode inputs
        token_field = TextField([Token(word) for word in sentence_tokens], self._token_indexers)
        fields['tokens'] = token_field
        fields['verb_span'] = SequenceLabelField(verb_vector, token_field, 'indicator_tags')
        fields['entity_span'] = SequenceLabelField(entity_vector, token_field, 'indicator_tags')

        # encode outputs
        if state_change_types:
            fields['state_change_type_labels'] = LabelField(state_change_types, 'state_change_type_labels')
        if state_change_tags:
            fields['state_change_tags'] = SequenceLabelField(state_change_tags, token_field, 'state_change_tags')

        return Instance(fields) 
Example #4
Source File: snli.py    From magnitude with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        if label:
            fields['label'] = LabelField(label)

        metadata = {"premise_tokens": [x.text for x in premise_tokens],
                    "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #5
Source File: template_text2sql.py    From allennlp-semparse with Apache License 2.0
def text_to_instance(
        self,  # type: ignore
        query: List[str],
        slot_tags: List[str] = None,
        sql_template: str = None,
    ) -> Instance:
        fields: Dict[str, Field] = {}
        tokens = TextField([Token(t) for t in query], self._token_indexers)
        fields["tokens"] = tokens

        if slot_tags is not None and sql_template is not None:
            slot_field = SequenceLabelField(slot_tags, tokens, label_namespace="slot_tags")
            template = LabelField(sql_template, label_namespace="template_labels")
            fields["slot_tags"] = slot_field
            fields["template"] = template

        return Instance(fields) 
Example #6
Source File: entailment_pair.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [Token(token.text)
                          for token in self._tokenizer.tokenize(premise)[-self._max_tokens:]]
        hypothesis_tokens = [Token(token.text)
                             for token in self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        if label:
            fields['label'] = LabelField(label)

        # metadata = {"premise_tokens": [x.text for x in premise_tokens],
        #             "hypothesis_tokens": [x.text for x in hypothesis_tokens]}
        # fields["metadata"] = MetadataField(metadata)
        return Instance(fields) 
Example #7
Source File: instance_test.py    From allennlp with Apache License 2.0
def test_duplicate(self):
        # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
        # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
        instance = Instance(
            {
                "words": TextField(
                    [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
                )
            }
        )

        other = instance.duplicate()
        assert other == instance

        # Adding new fields to the original instance should not affect the duplicate.
        instance.add_field("labels", LabelField("some_label"))
        assert "labels" not in other.fields
        assert other != instance  # sanity check on the '__eq__' method. 
Example #8
Source File: citation_data_reader_aclarc_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         cleaned_cite_text: str = None,
                         section_name: str = None,
                         is_citation: bool = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)
        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if is_citation is not None:
            fields['is_citation'] = LabelField(str(is_citation), label_namespace="cite_worthiness_labels")
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #9
Source File: citation_data_reader_scicite_aux.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         section_name: str = None) -> Instance:

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if section_name is not None:
            fields['section_label'] = LabelField(section_name, label_namespace="section_labels")

        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        return Instance(fields) 
Example #10
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
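In the swag-style readers above, the gold answer is already an integer index into the numbered hypothesis fields, so it is stored with skip_indexing=True and never touches a vocabulary. A short illustrative sketch (not part of the swagaf code) of what that flag does:

from allennlp.data.fields import LabelField

# An integer label with skip_indexing=True is stored as-is; passing a
# non-integer in this mode raises a ConfigurationError at construction time.
label = LabelField(3, skip_indexing=True)
print(label.as_tensor(label.get_padding_lengths()))  # tensor(3)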
Example #11
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        # premise_tokens = self._tokenizer.tokenize(premise)
        # fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
Example #12
Source File: arc_multichoice_json_reader.py    From ARC-Solvers with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         answer_id: int) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        question_tokens = self._tokenizer.tokenize(question_text)
        choices_tokens_list = [self._tokenizer.tokenize(x) for x in choice_text_list]
        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
           "id": item_id,
           "question_text": question_text,
           "choice_text_list": choice_text_list,
           "question_tokens": [x.text for x in question_tokens],
           "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
        }

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #13
Source File: semeval_2010_task_8_reader.py    From DISTRE with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         entity_1: Tuple[int],
                         entity_2: Tuple[int],
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        
        tokens = [OpenAISplitter._standardize(token) for token in tokens]
        tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']
            
        sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
        fields['sentence'] = sentence
        #fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
        #fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)
        
        if label:
            fields['label'] = LabelField(label)

        return Instance(fields) 
Example #14
Source File: entailment_tuple_reader.py    From ARC-Solvers with Apache License 2.0
def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         hypothesis_structure: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)[-self._max_tokens:]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': [token.text for token in premise_tokens],
            'hypothesis_tokens': [token.text for token in hypothesis_tokens]
        }
        fields['metadata'] = MetadataField(metadata)
        self._add_structure_to_fields(hypothesis_structure, fields)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #15
Source File: data_loading.py    From teaching with GNU General Public License v3.0
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field}) 
Example #16
Source File: dataset_reader.py    From swagaf with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypotheses: List[str],
                         label: int = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)
        fields['premise'] = TextField(premise_tokens, self._token_indexers)

        # This could be another way to get randomness
        for i, hyp in enumerate(hypotheses):
            hypothesis_tokens = self._tokenizer.tokenize(hyp)
            fields['hypothesis{}'.format(i)] = TextField(hypothesis_tokens, self._token_indexers)

        if label is not None:
            fields['label'] = LabelField(label, skip_indexing=True)
        return Instance(fields) 
Example #17
Source File: sent_sim_data.py    From glyce with Apache License 2.0
def text_to_instance(self, s1: str, s2: str, label: str = None, split_token: str = None) -> Instance:  # type: ignore
        tokens1 = self._tokenizer.tokenize(s1)
        tokens2 = self._tokenizer.tokenize(s2)
        # self._truncate_seq_pair(tokens1, tokens2, self.max_tokens)
        tokens1_field = TextField(tokens1, self._token_indexers)
        tokens2_field = TextField(tokens2, self._token_indexers)
        fields = {'premise': tokens1_field, 'hypothesis': tokens2_field}
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #18
Source File: list_field_test.py    From magnitude with MIT License
def test_nested_list_fields_are_padded_correctly(self):
        nested_field1 = ListField([LabelField(c) for c in [u'a', u'b', u'c', u'd', u'e']])
        nested_field2 = ListField([LabelField(c) for c in [u'f', u'g', u'h', u'i', u'j', u'k']])
        list_field = ListField([nested_field1.empty_field(), nested_field1, nested_field2])
        list_field.index(self.vocab)
        padding_lengths = list_field.get_padding_lengths()
        assert padding_lengths == {u'num_fields': 3, u'list_num_fields': 6}
        tensor = list_field.as_tensor(padding_lengths).detach().cpu().numpy()
        numpy.testing.assert_almost_equal(tensor, [[-1, -1, -1, -1, -1, -1],
                                                   [0, 1, 2, 3, 4, -1],
                                                   [5, 6, 7, 8, 9, 10]]) 
Example #19
Source File: bert_pretraining_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                        sentence1: str,
                        sentence2: str,
                        next_sentence_label: int):

        fields = self._tokenizer_masker.tokenize_candidates_mask(sentence1, sentence2)

        # NSP label field
        fields['next_sentence_label'] = \
            LabelField(next_sentence_label, skip_indexing=True)

        return Instance(fields) 
Example #20
Source File: tacred_dataset_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                         sentence: Iterable[str],
                         relation: str,
                         subj_start: int,
                         subj_end: int,
                         obj_start: int,
                         obj_end: int) -> Instance:
        """
        Following approach in:
            https://openreview.net/forum?id=BJgrxbqp67
        We modify the input to look like:
            [CLS] subj [SEP] obj [SEP] sentence [SEP]
        """
        token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence)

        if self.entity_masking == 'type/role/segment':
            offsets = [1] + token_candidates['offsets_a'][:-1]
            segment_ids = list(token_candidates['segment_ids'])
            for s, e, ii in [[subj_start, subj_end+1, 1], [obj_start, obj_end+1, 2]]:
                ll = offsets[e] - offsets[s]
                segment_ids[offsets[s]:offsets[e]] = [ii] * ll
            # the type + [SEP]
            segment_ids[-4:-2] = [1, 1]
            segment_ids[-2:] = [2, 2]
            token_candidates['segment_ids'] = segment_ids

        # get the indices of the entity starts
        offsets = [1] + token_candidates['offsets_a'][:-1]
        idx1_offset = offsets[subj_start]
        idx2_offset = offsets[obj_start]

        fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        fields['label_ids'] = LabelField(LABEL_MAP[relation], skip_indexing=True)
        fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)
        fields['index_b'] = LabelField(idx2_offset, skip_indexing=True)

        return Instance(fields) 
Example #21
Source File: ultra_fine_reader.py    From kb with Apache License 2.0
def text_to_instance(self, sentence, span, labels, index_entity_start):
        token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence, span)
        fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        fields['label_ids'] = ArrayField(np.array(labels), dtype=np.int)

        # index of entity start
        if index_entity_start is not None:
            offsets = [1] + token_candidates['offsets_a'][:-1]
            idx1_offset = offsets[index_entity_start]
            fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)

        return Instance(fields) 
Example #22
Source File: entity_linking.py    From kb with Apache License 2.0
def _combine_instances(self, instance_a, instance_b, nsp_label, gold_cache):
        text_a = ' '.join([t.text for t in instance_a['tokens'].tokens])
        text_b = ' '.join([t.text for t in instance_b['tokens'].tokens])

        fields = self.tokenizer_and_masker.tokenize_candidates_mask(text_a, text_b)
        candidate_spans = [
            [s.span_start, s.span_end]
            for s in fields['candidates'].field_dict[self.id_type].field_dict['candidate_spans'].field_list
        ]
        assert sorted(candidate_spans) == candidate_spans

        # combine the gold entities
        golds = []
        for text in [text_a, text_b]:
            golds.append(gold_cache[text])

        combined_golds = []
        j = [-1, -1]
        for span in candidate_spans:
            i = fields['segment_ids'].array[span[0]]
            j[i] += 1
            combined_golds.append(golds[i][j[i]])

        gold_text_field = TextField(
            [Token(g) for g in combined_golds],
            token_indexers=self.entity_indexer
        )
        fields['gold_entities'] = DictField({self.id_type: gold_text_field})

        if self.use_nsp_label:
            fields['next_sentence_label'] = LabelField(nsp_label, skip_indexing=True)

        del fields['lm_label_ids']

        return Instance(fields) 
Example #23
Source File: cls_data.py    From glyce with Apache License 2.0
def text_to_instance(self, sentence: str, label: str) -> Instance:
        sentence_tokens = self._tokenizer.tokenize(sentence)
        if len(sentence) > self.max_sentence_length:
            sentence_tokens = sentence_tokens[:self.max_sentence_length]
            self.trimmed_count += 1
        sentence_field = TextField(sentence_tokens, self._token_indexers)
        fields = {'sentence': sentence_field}
        if label is not None:
            fields['label'] = LabelField(label)
        return Instance(fields) 
Example #24
Source File: ir_triple_loader.py    From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ
        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
        #doc_pos_tokenized.insert(0, Token(START_SYMBOL))
        #doc_pos_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

        doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

        doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
        #doc_neg_tokenized.insert(0, Token(START_SYMBOL))
        #doc_neg_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

        doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_pos_length = LabelField(len(doc_pos_tokenized), skip_indexing=True)
        doc_neg_length = LabelField(len(doc_neg_tokenized), skip_indexing=True)

        return Instance({
            "query_tokens":query_field,
            "doc_pos_tokens":doc_pos_field,
            "doc_neg_tokens": doc_neg_field,
            "query_length":query_length,
            "doc_pos_length":doc_pos_length,
            "doc_neg_length":doc_neg_length}) 
Example #25
Source File: ir_labeled_tuple_loader.py    From sigir19-neural-ir with Apache License 2.0
def text_to_instance(self, query_id:str, doc_id:str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
        # pylint: disable=arguments-differ

        query_id_field = LabelField(int(query_id), skip_indexing=True)
        doc_id_field = LabelField(int(doc_id), skip_indexing=True)

        query_tokenized = self._tokenizer.tokenize(query_sequence)
        #if self._source_add_start_token:
        #    query_tokenized.insert(0, Token(START_SYMBOL))
        #query_tokenized.append(Token(END_SYMBOL))
        if self.max_query_length > -1:
            query_tokenized = query_tokenized[:self.max_query_length]

        query_field = TextField(query_tokenized, self._token_indexers)
        
        doc_tokenized = self._tokenizer.tokenize(doc_sequence)
        #doc_tokenized.insert(0, Token(START_SYMBOL))
        #doc_tokenized.append(Token(END_SYMBOL))
        if self.max_doc_length > -1:
            doc_tokenized = doc_tokenized[:self.max_doc_length]

        doc_field = TextField(doc_tokenized, self._token_indexers)

        query_length = LabelField(len(query_tokenized), skip_indexing=True)
        doc_length = LabelField(len(doc_tokenized), skip_indexing=True)

        return Instance({
            "query_id":query_id_field,
            "doc_id":doc_id_field,
            "query_tokens":query_field,
            "doc_tokens":doc_field,
            "query_length":query_length,
            "doc_length":doc_length}) 
Example #26
Source File: imdb_review_reader.py    From topic-rnn with Apache License 2.0
def _read(self, file_path):
        # A training instance consists of the word frequencies for the entire review and a
        # ``words_per_instance`` portion of the review.
        file_path = cached_path(file_path)

        # Partition each review into BPTT Limit + 1 chunks to allow room for input (chunk[:-1])
        # and output (chunk[1:]).
        # Break up the text into a series of BPTT chunks and yield one at a time.
        num_tokens = self._words_per_instance + 1
        with open(cached_path(file_path), 'r') as data_file:
            logger.info("Reading instances from lines in file: %s", file_path)
            for line in data_file:
                line = line.strip("\n")
                if not line:
                    continue
                example = ujson.loads(line)
                example_text = example['text']
                example_text_tokenized = self._tokenizer.tokenize(example_text)
                example_sentiment = "positive" if example['sentiment'] >= 5 else "negative"
                example_sentiment_field = LabelField(example_sentiment)

                # Each review will receive the entire encoded review.
                frequency_field = TextField(example_text_tokenized, self._token_indexers)
                tokenized_strings = []
                for index in range(0, len(example_text_tokenized) - num_tokens, num_tokens - 1):
                    tokenized_strings.append(example_text_tokenized[index:(index + num_tokens)])

                    # By breaking early when training a classifier, we prevent training on duplicates.
                    if self._classification_mode:
                        break

                for tokenized_string in tokenized_strings:
                    input_field = TextField(tokenized_string[:-1], self._token_indexers)
                    output_field = TextField(tokenized_string[1:], self._token_indexers)
                    yield Instance({'input_tokens': input_field,
                                    'output_tokens': output_field,
                                    'frequency_tokens': frequency_field,
                                    'sentiment': example_sentiment_field}) 
Example #27
Source File: bert_reader_sent_selection.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.query_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = MetadataField(text1_span)
        fields['bert_s2_span'] = MetadataField(text2_span)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #28
Source File: bert_fever_reader.py    From semanticRetrievalMRS with MIT License
def text_to_instance(self,  # type: ignore
                         sent1: str,  # Important type information
                         sent2: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        tokenized_text1 = self.bert_tokenizer.tokenize(sent1)
        tokenized_text2 = self.bert_tokenizer.tokenize(sent2)

        # _truncate_seq_pair(tokenized_text1, tokenized_text2, self.max_l)
        tokenized_text1 = tokenized_text1[:self.s1_l]
        tokenized_text2 = tokenized_text2[:(self.max_l - len(tokenized_text1))]

        joint_tokens_seq = ['[CLS]'] + tokenized_text1 + ['[SEP]'] + tokenized_text2 + ['[SEP]']
        text1_len = len(tokenized_text1) + 2
        text2_len = len(tokenized_text2) + 1
        segments_ids = [0 for _ in range(text1_len)] + [1 for _ in range(text2_len)]

        joint_tokens_ids = self.bert_tokenizer.convert_tokens_to_ids(joint_tokens_seq)
        assert len(joint_tokens_ids) == len(segments_ids)

        fields['paired_sequence'] = BertIndexField(np.asarray(joint_tokens_ids, dtype=np.int64))
        fields['paired_segments_ids'] = BertIndexField(np.asarray(segments_ids, dtype=np.int64))

        text1_span = (1, 1 + len(tokenized_text1)) # End is exclusive (important for later use)
        text2_span = (text1_span[1] + 1, text1_span[1] + 1 + len(tokenized_text2))

        fields['bert_s1_span'] = SpanField(text1_span[0], text1_span[1], fields['paired_sequence'])
        fields['bert_s2_span'] = SpanField(text2_span[0], text2_span[1], fields['paired_sequence'])

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields) 
Example #29
Source File: entailment_tuple_reader.py    From ARC-Solvers with Apache License 2.0
def _add_structure_to_fields(self, structure, fields) -> None:
        """
        Add structure (nodes and edges) to the instance fields. Specifically, convert
        "plants<>produce<>oxygen" into ("produce", subj, "plants"), ("produce", obj, "oxygen"),
        ("plants", subj-obj, "oxygen"). Each quoted string forms a node represented using a
        TextField. Each source and target node in an edge is represented using IndexField into
        the list of nodes and the edge label is represented using a LabelField with "edges"
        namespace.
        """
        # take the last tuples
        tuples = structure.split("$$$")[-self._max_tuples:]
        node_list, edge_list = self._extract_nodes_and_edges_from_tuples(tuples)
        if not len(node_list):
            print("No nodes in {} for premise:{} and hypothesis: {}".format(
                structure, fields['metadata'].metadata["premise"],
                fields['metadata'].metadata["hypothesis"]))
        nodes_field = ListField(node_list)
        edge_source_list = []
        edge_target_list = []
        edge_label_list = []
        for edge in edge_list:
            source_field = IndexField(edge[0], nodes_field)
            target_field = IndexField(edge[2], nodes_field)
            label_field = LabelField(edge[1], "edges")
            edge_source_list.append(source_field)
            edge_target_list.append(target_field)
            edge_label_list.append(label_field)
        fields['nodes'] = nodes_field
        # Currently AllenNLP doesn't allow for ListFields containing ListFields,
        # so creating separate ListFields for source, target and labels for the edges
        fields['edge_sources'] = ListField(edge_source_list)
        fields['edge_targets'] = ListField(edge_target_list)
        fields['edge_labels'] = ListField(edge_label_list) 
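The docstring above describes how a tuple string such as "plants<>produce<>oxygen" becomes a list of nodes plus labeled edges, with each edge label placed in the "edges" namespace. A rough, hypothetical sketch of that mapping (the real extraction happens in _extract_nodes_and_edges_from_tuples, which is not shown here):

from allennlp.data.fields import LabelField

# Hypothetical, simplified illustration of the docstring, not the ARC-Solvers helper itself.
structure = "plants<>produce<>oxygen"
subj, verb, obj = structure.split("<>")
nodes = [subj, verb, obj]                                    # wrapped in TextFields by the real reader
edges = [(1, "subj", 0), (1, "obj", 2), (0, "subj-obj", 2)]  # (source, label, target) node indices
edge_label_fields = [LabelField(label, "edges") for _, label, _ in edges]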
Example #30
Source File: semisupervised_text_classification_json.py    From vampire with Apache License 2.0
def text_to_instance(self, text: str, label: str = None) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        text : ``str``, required.
            The text to classify
        label : ``str``, optional (default = None).
            The label for this text.

        Returns
        -------
        An ``Instance`` containing the following fields:
            tokens : ``TextField``
                The tokens in the sentence or phrase.
            label : ``LabelField``
                The label of the sentence or phrase.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize(text)
        if self._max_sequence_length is not None:
            tokens = self._truncate(tokens)
        fields['tokens'] = TextField(tokens, self._token_indexers)
        if label is not None:
            fields['label'] = LabelField(label,
                                         skip_indexing=self._skip_label_indexing)
        return Instance(fields)