Python allennlp.data.fields.ArrayField() Examples

The following are 29 code examples of allennlp.data.fields.ArrayField(), drawn from open-source projects; the source file, project, and license for each example are noted above it. You may also want to check out all available functions and classes of the module allennlp.data.fields.
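Before the project examples, a minimal sketch of the API these snippets exercise (construction, padding, and tensor conversion, matching the behaviour the tests below assert):

import numpy
from allennlp.data.fields import ArrayField

field = ArrayField(numpy.ones([2, 3]), padding_value=0)

# Padding lengths are keyed per axis: {"dimension_0": 2, "dimension_1": 3}.
lengths = field.get_padding_lengths()

# as_tensor() pads up to the requested lengths with padding_value and
# returns a torch tensor, here of shape (3, 5).
tensor = field.as_tensor({"dimension_0": 3, "dimension_1": 5})

# empty_field() produces an all-padding field with the same dtype.
empty = field.empty_field()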
Example #1
Source File: vampire_reader.py    From vampire with Apache License 2.0
def text_to_instance(self, vec: np.ndarray = None) -> Instance:  # type: ignore
        """
        Parameters
        ----------
        vec : ``np.ndarray``, required.
            The pre-computed document vector to wrap in an ``ArrayField``.

        Returns
        -------
        An ``Instance`` containing the following field:
            tokens : ``ArrayField``
                The document vector.
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        fields['tokens'] = ArrayField(vec)
        return Instance(fields) 
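A hypothetical call, assuming `reader` is a constructed instance of this dataset reader:

import numpy as np
# `reader` is assumed to exist; the vector becomes the 'tokens' ArrayField.
instance = reader.text_to_instance(np.array([0.1, 0.4, 0.5]))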
Example #2
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_padding_handles_list_fields(self):
        array1 = ArrayField(numpy.ones([2, 3]))
        array2 = ArrayField(numpy.ones([1, 5]))
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = (
            list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        )
        correct_tensor = numpy.array(
            [
                [[1.0, 1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0, 0.0]],
                [[1.0, 1.0, 1.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0]],
                [[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]],
            ]
        )
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #3
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = (
            list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        )
        correct_tensor = numpy.array(
            [
                [[1.0, 1.0, 1.0, -1.0, -1.0], [1.0, 1.0, 1.0, -1.0, -1.0]],
                [[1.0, 1.0, 1.0, 1.0, 1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
                [[-1.0, -1.0, -1.0, -1.0, -1.0], [-1.0, -1.0, -1.0, -1.0, -1.0]],
            ]
        )
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
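Taken together, Examples #2 and #3 pin down the padding contract: ListField pads every ArrayField up to the largest shape along each dimension (here 2 x 5), filling the new cells with the field's padding_value, which defaults to 0 and is -1 in the second test; an empty_field() entry comes out as all padding.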
Example #4
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_alternative_dtypes(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)

        # Setting dtype to numpy.int64 should produce a torch.LongTensor when field is converted to
        # a tensor
        array_field1 = ArrayField(array, dtype=numpy.int64)
        returned_tensor1 = array_field1.as_tensor(array_field1.get_padding_lengths())
        assert returned_tensor1.dtype == torch.int64

        # Setting dtype to numpy.uint8 should produce a torch.ByteTensor when field is converted to
        # a tensor
        array_field2 = ArrayField(array, dtype=numpy.uint8)
        returned_tensor2 = array_field2.as_tensor(array_field2.get_padding_lengths())
        assert returned_tensor2.dtype == torch.uint8

        # Padding should not affect dtype
        padding_lengths = {"dimension_" + str(i): 10 for i, _ in enumerate(shape)}
        padded_tensor = array_field2.as_tensor(padding_lengths)
        assert padded_tensor.dtype == torch.uint8

        # Empty fields should have the same dtype
        empty_field = array_field2.empty_field()
        assert empty_field.dtype == array_field2.dtype 
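Note that dtype is a property of the field itself rather than of a particular conversion: it is fixed at construction and, as the assertions show, survives both padding and empty_field(). Left unset, it falls back to the field's default (float32 in the AllenNLP versions these tests target).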
Example #5
Source File: array_field_test.py    From magnitude with MIT License
def test_padding_handles_list_fields(self):
        array1 = ArrayField(numpy.ones([2, 3]))
        array2 = ArrayField(numpy.ones([1, 5]))
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., 0., 0.],
                                       [1., 1., 1., 0., 0.]],
                                      [[1., 1., 1., 1., 1.],
                                       [0., 0., 0., 0., 0.]],
                                      [[0., 0., 0., 0., 0.],
                                       [0., 0., 0., 0., 0.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #6
Source File: test_dict_field.py    From kb with Apache License 2.0
def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()}
        )

        instance3_fields = {
            "candidate_entities": TextField(
                    [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
                    token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array([[0.1, 0.1, 0.8],
                                                           [1.0, 0.0, 0.0],
                                                           [0.33, 0.67, 0.0]])),
            "candidate_spans": ListField(
                    [SpanField(1, 1, tokens3_field), SpanField(1, 2, tokens3_field), SpanField(1, 3, tokens3_field)],
            )
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(self.instance2_fields)])}),
                     Instance({"candidates": ListField([
                                    DictField(self.instance1_fields),
                                    DictField(instance3_fields)])})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(batch['candidates']['candidate_entities']['entity'].shape == batch['candidates']['candidate_entity_prior'].shape) 
Example #7
Source File: ultra_fine_reader.py    From kb with Apache License 2.0
def text_to_instance(self, sentence, span, labels, index_entity_start):
        token_candidates = self.tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence, span)
        fields = self.tokenizer_and_candidate_generator.convert_tokens_candidates_to_fields(token_candidates)
        fields['label_ids'] = ArrayField(np.array(labels), dtype=np.int64)  # np.int was removed in NumPy 1.24

        # index of entity start
        if index_entity_start is not None:
            offsets = [1] + token_candidates['offsets_a'][:-1]
            idx1_offset = offsets[index_entity_start]
            fields['index_a'] = LabelField(idx1_offset, skip_indexing=True)

        return Instance(fields) 
Example #8
Source File: single_correct_mcq_entailment.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premises: List[str],
                         hypotheses: List[str],
                         answer_index: int = None,
                         relevant_sentence_idxs: List[int] = None) -> Instance:
        fields = {}
        premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                           for premise in premises]
        hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                             for hypothesis in hypotheses]
        if premises:
            premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                    for premise_tokens in premises_tokens]
            premises_field = ListField(premises_text_fields)
        else:
            empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                                for hypothesis_tokens in hypotheses_tokens]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

        # If entailment labels are available
        if answer_index is not None:
            if answer_index not in range(0, len(hypotheses)):
                raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
            fields['answer_index'] = ArrayField(np.array(answer_index), padding_value=-1, dtype=np.int64)  # np.long is no longer a valid NumPy dtype

        paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
        paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
        fields['paragraph'] = paragraph_text_field
        return Instance(fields) 
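The relevance mask above is a multi-hot vector over premises; a standalone sketch of the same pattern (the helper name is hypothetical):

import numpy as np
from allennlp.data.fields import ArrayField

def multi_hot_mask(size, hot_idxs):
    # Hypothetical helper: 1.0 at each listed index, 0.0 elsewhere.
    mask = np.zeros(size)
    mask[hot_idxs] = 1
    return ArrayField(mask)

field = multi_hot_mask(5, [0, 3])  # wraps array([1., 0., 0., 1., 0.])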
Example #9
Source File: array_field_test.py    From magnitude with MIT License
def test_printing_doesnt_crash(self):
        array = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        print(array) 
Example #10
Source File: array_field_test.py    From magnitude with MIT License
def test_padding_handles_list_fields_with_padding_values(self):
        array1 = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        array2 = ArrayField(numpy.ones([1, 5]), padding_value=-1)
        empty_array = array1.empty_field()
        list_field = ListField([array1, array2, empty_array])

        returned_tensor = list_field.as_tensor(list_field.get_padding_lengths()).detach().cpu().numpy()
        correct_tensor = numpy.array([[[1., 1., 1., -1., -1.],
                                       [1., 1., 1., -1., -1.]],
                                      [[1., 1., 1., 1., 1.],
                                       [-1., -1., -1., -1., -1.]],
                                      [[-1., -1., -1., -1., -1.],
                                       [-1., -1., -1., -1., -1.]]])
        numpy.testing.assert_array_equal(returned_tensor, correct_tensor) 
Example #11
Source File: dataset_reader.py    From ConvLab with MIT License
def text_to_instance(self, state: np.ndarray, action: int = None) -> Instance:  # type: ignore
        """
        """
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        fields["states"] = ArrayField(state)
        if action is not None:
            fields["actions"] = LabelField(action, skip_indexing=True)
        return Instance(fields) 
Example #12
Source File: array_field_test.py    From magnitude with MIT License
def test_as_tensor_handles_larger_padding_dimensions(self):
        shape = [3, 4]
        array = numpy.ones(shape)
        array_field = ArrayField(array)

        padded_tensor = array_field.as_tensor({u"dimension_0": 5, u"dimension_1": 6}).detach().cpu().numpy()
        numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
        numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.) 
Example #13
Source File: array_field_test.py    From magnitude with MIT License
def test_get_padding_lengths_correctly_returns_ordered_shape(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)
        array_field = ArrayField(array)
        lengths = array_field.get_padding_lengths()
        for i in range(len(lengths)):
            assert lengths[u"dimension_{}".format(i)] == shape[i] 
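i.e. for the shape [3, 4, 5, 6] above, get_padding_lengths() returns {"dimension_0": 3, "dimension_1": 4, "dimension_2": 5, "dimension_3": 6}, one entry per axis in order.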
Example #14
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_eq(self):
        array1 = ArrayField(numpy.asarray([1, 1, 1]))
        array2 = ArrayField(numpy.asarray([[1, 1, 1], [1, 1, 1]]))
        array3 = ArrayField(numpy.asarray([1, 1, 2]))
        array4 = ArrayField(numpy.asarray([1, 1, 1]))
        assert array1 != array2
        assert array1 != array3
        assert array1 == array4 
Example #15
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_len_works_with_scalar(self):
        array = ArrayField(numpy.asarray(42))
        assert len(array) == 1 
Example #16
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_with_scalar_keeps_dtype(self):
        array = ArrayField(numpy.asarray(42, dtype=numpy.float32))
        returned_tensor = array.as_tensor(array.get_padding_lengths())
        assert returned_tensor.dtype == torch.float32 
Example #17
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_printing_doesnt_crash(self):
        array = ArrayField(numpy.ones([2, 3]), padding_value=-1)
        print(array) 
Example #18
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_as_tensor_handles_larger_padding_dimensions(self):
        shape = [3, 4]
        array = numpy.ones(shape)
        array_field = ArrayField(array)

        padded_tensor = (
            array_field.as_tensor({"dimension_0": 5, "dimension_1": 6}).detach().cpu().numpy()
        )
        numpy.testing.assert_array_equal(padded_tensor[:3, :4], array)
        numpy.testing.assert_array_equal(padded_tensor[3:, 4:], 0.0) 
Example #19
Source File: array_field_test.py    From allennlp with Apache License 2.0
def test_get_padding_lengths_correctly_returns_ordered_shape(self):
        shape = [3, 4, 5, 6]
        array = numpy.zeros(shape)
        array_field = ArrayField(array)
        lengths = array_field.get_padding_lengths()
        for i in range(len(lengths)):
            assert lengths["dimension_{}".format(i)] == shape[i] 
Example #20
Source File: arc_multichoice_with_facts_text_json_reader_multi_source.py    From OpenBookQA with Apache License 2.0
def text_to_instance(self,  # type: ignore
                         item_id: Any,
                         question_text: str,
                         choice_text_list: List[str],
                         facts_text_list: List[str],
                         question2facts_mapping: List[float],
                         choice2facts_mapping: List[List[float]],
                         answer_id: int,
                         meta_fields: Dict = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        question_tokens = self.tokenize(question_text, "question")
        choices_tokens_list = [self.tokenize(x, "choice") for x in choice_text_list]
        facts_tokens_list = [self.tokenize(x, "fact") for x in facts_text_list]

        fields['question'] = TextField(question_tokens, self._token_indexers)
        fields['choices_list'] = ListField([TextField(x, self._token_indexers) for x in choices_tokens_list])
        fields['facts_list'] = ListField([TextField(x, self._token_indexers) for x in facts_tokens_list])
        fields['question2facts_map'] = ArrayField(np.asarray(question2facts_mapping))
        fields['choice2facts_map'] = ArrayField(np.asarray(choice2facts_mapping))

        fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question_text,
            "choice_text_list": choice_text_list,
            "facts_text_list": facts_text_list,
            "question_tokens": [x.text for x in question_tokens],
            "choice_tokens_list": [[x.text for x in ct] for ct in choices_tokens_list],
            "facts_tokens_list": [[x.text for x in ct] for ct in facts_tokens_list],
            "label_gold": answer_id,
        }

        if meta_fields is not None:
            for k, v in meta_fields.items():
                metadata[k] = v

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields) 
Example #21
Source File: summarization_reader.py    From summarus with Apache License 2.0
def text_to_instance(self, source: str, target: str = None) -> Instance:
        def prepare_text(text, max_tokens):
            text = text.lower() if self._lowercase else text
            tokens = self._tokenizer.tokenize(text)[:max_tokens]
            tokens.insert(0, Token(START_SYMBOL))
            tokens.append(Token(END_SYMBOL))
            return tokens

        source_tokens = prepare_text(source, self._source_max_tokens)
        source_tokens_indexed = TextField(source_tokens, self._source_token_indexers)
        result = {'source_tokens': source_tokens_indexed}
        meta_fields = {}

        if self._save_copy_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens[1:-1], self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens[1:-1]]

        if self._save_pgn_fields:
            source_to_target_field = NamespaceSwappingField(source_tokens, self._target_namespace)
            result["source_to_target"] = source_to_target_field
            meta_fields["source_tokens"] = [x.text for x in source_tokens]

        if target:
            target_tokens = prepare_text(target, self._target_max_tokens)
            target_tokens_indexed = TextField(target_tokens, self._target_token_indexers)
            result['target_tokens'] = target_tokens_indexed

            if self._save_pgn_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids, dtype='long'))
                target_token_ids = source_and_target_token_ids[len(source_tokens):]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids, dtype='long'))

            if self._save_copy_fields:
                meta_fields["target_tokens"] = [y.text for y in target_tokens[1:-1]]
                source_and_target_token_ids = self._tokens_to_ids(source_tokens[1:-1] + target_tokens, self._lowercase)
                source_token_ids = source_and_target_token_ids[:len(source_tokens)-2]
                result["source_token_ids"] = ArrayField(np.array(source_token_ids))
                target_token_ids = source_and_target_token_ids[len(source_tokens)-2:]
                result["target_token_ids"] = ArrayField(np.array(target_token_ids))

        elif self._save_copy_fields:
            source_token_ids = self._tokens_to_ids(source_tokens[1:-1], self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        elif self._save_pgn_fields:
            source_token_ids = self._tokens_to_ids(source_tokens, self._lowercase)
            result["source_token_ids"] = ArrayField(np.array(source_token_ids))
        if self._save_copy_fields or self._save_pgn_fields:
            result["metadata"] = MetadataField(meta_fields)
        return Instance(result) 
Example #22
Source File: citation_data_reader_aclarc.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         citing_paper_title: str = None,
                         cited_paper_title: str = None,
                         citing_paper_year: int = None,
                         cited_paper_year: int = None,
                         citing_author_ids: List[str] = None,
                         cited_author_ids: List[str] = None,
                         extended_context: str = None,
                         section_number: int = None,
                         section_title: str = None,
                         sents_before: List[str] = None,
                         sents_after: List[str] = None,
                         cite_marker_begin: int = None,
                         cite_marker_end: int = None,
                         cleaned_cite_text: str = None,
                         citation_excerpt_index: str = None,
                         citation_id: str = None,
                         venue: str = None) -> Instance:  # type: ignore

        citation_tokens = self._tokenizer.tokenize(citation_text)
        # tok_cited_title = self._tokenizer.tokenize(cited_paper_title)
        # tok_citing_title = self._tokenizer.tokenize(citing_paper_title)
        # tok_extended_context = self._tokenizer.tokenize(extended_context)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if self.use_sparse_lexicon_features:
            # convert to regular string
            sent = [token.text.lower() for token in citation_tokens]
            lexicon_features, _ = is_in_lexicon(self.lexicons, sent)
            fields["lexicon_features"] = ListField([LabelField(feature, skip_indexing=True)
                                                    for feature in lexicon_features])

        if intent is not None:
            fields['labels'] = LabelField(intent)

        if citing_paper_year and cited_paper_year and \
                citing_paper_year > -1 and cited_paper_year > -1:
            year_diff = citing_paper_year - cited_paper_year
        else:
            year_diff = -1
        fields['year_diff'] = ArrayField(torch.Tensor([year_diff]))
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        fields['citation_excerpt_index'] = MetadataField(citation_excerpt_index)
        fields['citation_id'] = MetadataField(citation_id)
        return Instance(fields) 
Example #23
Source File: citation_data_reader_scicite.py    From scicite with Apache License 2.0
def text_to_instance(self,
                         citation_text: str,
                         citing_paper_id: str,
                         cited_paper_id: str,
                         intent: List[str] = None,
                         citing_paper_title: str = None,
                         cited_paper_title: str = None,
                         citing_paper_year: int = None,
                         cited_paper_year: int = None,
                         citing_author_ids: List[str] = None,
                         cited_author_ids: List[str] = None,
                         extended_context: str = None,
                         section_number: int = None,
                         section_title: str = None,
                         cite_marker_begin: int = None,
                         cite_marker_end: int = None,
                         sents_before: List[str] = None,
                         sents_after: List[str] = None,
                         cleaned_cite_text: str = None,
                         citation_excerpt_index: str = None,
                         venue: str = None) -> Instance:  # type: ignore

        citation_tokens = self._tokenizer.tokenize(citation_text)

        fields = {
            'citation_text': TextField(citation_tokens, self._token_indexers),
        }

        if self.use_sparse_lexicon_features:
            # convert to regular string
            sent = [token.text.lower() for token in citation_tokens]
            lexicon_features, _ = is_in_lexicon(self.lexicons, sent)
            fields["lexicon_features"] = ListField([LabelField(feature, skip_indexing=True)
                                                    for feature in lexicon_features])

        if intent:
            if self.multilabel:
                fields['labels'] = MultiLabelField([S2_CATEGORIES[e] for e in intent], skip_indexing=True,
                                                   num_labels=len(S2_CATEGORIES))
            else:
                if not isinstance(intent, str):
                    raise TypeError(f"Undefined label format. Should be a string. Got: {intent}")
                fields['labels'] = LabelField(intent)

        if citing_paper_year and cited_paper_year and \
                citing_paper_year > -1 and cited_paper_year > -1:
            year_diff = citing_paper_year - cited_paper_year
        else:
            year_diff = -1
        fields['year_diff'] = ArrayField(torch.Tensor([year_diff]))
        fields['citing_paper_id'] = MetadataField(citing_paper_id)
        fields['cited_paper_id'] = MetadataField(cited_paper_id)
        fields['citation_excerpt_index'] = MetadataField(citation_excerpt_index)
        fields['citation_id'] = MetadataField(f"{citing_paper_id}>{cited_paper_id}")
        return Instance(fields) 
Example #24
Source File: bert_tokenizer_and_candidate_generator.py    From kb with Apache License 2.0
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to
        a dict of fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
                [Token(t, text_id=self.bert_tokenizer.vocab[t])
                    for t in tokens_and_candidates['tokens']],
                token_indexers=self._bert_single_id_indexer
        )

        fields['segment_ids'] = ArrayField(
            np.array(tokens_and_candidates['segment_ids']), dtype=np.int64
        )

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates['candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                    entity_candidates['candidate_entity_priors']
            )
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities": TextField(
                    [Token(" ".join(candidate_list)) for candidate_list in entity_candidates["candidate_entities"]],
                    token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans": ListField(
                    [SpanField(span[0], span[1], fields['tokens']) for span in
                    entity_candidates['candidate_spans']]
                ),
                "candidate_segment_ids": ArrayField(
                    np.array(entity_candidates['candidate_segment_ids']), dtype=np.int
        )
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields 
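The prior-padding loop above recurs in wordnet.py and wiki_linking_reader.py below; a hypothetical helper that distills the pattern without mutating its input (which would also make the deepcopy unnecessary):

import numpy as np

def pad_ragged(rows, fill=0.0):
    # Right-pad each row to the longest row so np.array() yields a rectangular 2-D array.
    width = max(len(r) for r in rows)
    return np.array([r + [fill] * (width - len(r)) for r in rows])

np_prior = pad_ragged([[0.2, 0.8], [1.0]])  # -> [[0.2, 0.8], [1.0, 0.0]]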
Example #25
Source File: wordnet.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokens: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         gold_data_ids: List[str] = None):

        # prior needs to be 2D and full
        # can look like [[0.2, 0.8], [1.0]]  if one candidate for second
        # candidate span and two candidates for first
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        fields = {
            "tokens": TextField([Token(t) for t in tokens],
                      token_indexers=self.token_indexers),

            # join by space, then retokenize in the "character indexer"
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(np.array(np_prior)),
            # only one sentence
            "candidate_segment_ids": ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int
            )
        }

        if gold_entities is not None:
            fields["gold_entities"] =  TextField([Token(entity) for entity in gold_entities],
                                                  token_indexers=self.entity_indexer)
        if gold_data_ids is not None:
            fields["gold_data_ids"] = MetadataField(gold_data_ids)

        span_fields = []
        for span in candidate_spans:
            span_fields.append(SpanField(span[0], span[1], fields['tokens']))
        fields['candidate_spans'] = ListField(span_fields)

        if self.extra_candidate_generators:
            tokens = " ".join(tokens)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #26
Source File: kg_probe_reader.py    From kb with Apache License 2.0
def text_to_instance(self, sentence: str, span: Tuple[int, ...]):
        token_candidates = self._tokenizer_and_candidate_generator.tokenize_and_generate_candidates(sentence)

        # NOTE: Skipping the padding here since sentences are all quite short.
        vocab = self._tokenizer_and_candidate_generator.bert_tokenizer.vocab
        lm_label_ids = TextField(
            [Token(t, text_id=vocab[t]) for t in token_candidates['tokens']],
            token_indexers=self._label_indexer
        )

        # We need to offset the start and end of the span so that it aligns with word pieces.
        if span[0] == 0:
            start = 1  # Since 0'th elt. is <CLS>
        else:
            start = token_candidates['offsets_a'][span[0] - 1]
        end = token_candidates['offsets_a'][span[1]]

        masked_tokens: List[str] = token_candidates['tokens'].copy()
        mask_indicator = np.zeros(len(masked_tokens), dtype=np.uint8)
        for i in range(start, end):
            masked_tokens[i] = '[MASK]'
            mask_indicator[i] = 1

        token_candidates['tokens'] = masked_tokens

        # mask out the entity candidates
        candidates = token_candidates['candidates']
        for candidate_key in candidates.keys():
            indices_to_mask = []
            for k, candidate_span in enumerate(candidates[candidate_key]['candidate_spans']):
                # (end-1) as candidate spans are exclusive (e.g. candidate_span = (0, 0) has start=0, end=1)
                if (candidate_span[0] >= start and candidate_span[0] <= end-1) or (
                    candidate_span[1] >= start and candidate_span[1] <= end-1):
                    indices_to_mask.append(k)
            for ind in indices_to_mask:
                candidates[candidate_key]['candidate_entities'][ind] = ['@@MASK@@']
                candidates[candidate_key]['candidate_entity_priors'][ind] = [1.0]

        fields = self._tokenizer_and_candidate_generator. \
            convert_tokens_candidates_to_fields(token_candidates)

        fields['lm_label_ids'] = lm_label_ids
        fields['mask_indicator'] = ArrayField(mask_indicator, dtype=np.uint8)

        return Instance(fields) 
Example #27
Source File: wiki_linking_reader.py    From kb with Apache License 2.0
def text_to_instance(self,
                         tokenized_text: List[str],
                         candidate_entities: List[List[str]],
                         candidate_spans: List[List[int]],
                         candidate_entity_prior: List[List[float]],
                         gold_entities: List[str] = None,
                         doc_id: str = None):

        assert doc_id is not None

        token_field = TextField([Token(x) for x in tokenized_text], self.token_indexers)
        span_fields = ListField([SpanField(*span, token_field) for span in candidate_spans])

        candidate_entities = TextField(
                [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
                token_indexers=self.entity_indexer)

        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)
        prior_field = ArrayField(np_prior)

        # only one segment
        candidate_segment_ids = ArrayField(
                np.array([0] * len(candidate_entities)), dtype=np.int64
        )

        fields = {
            "tokens": token_field,
            "candidate_spans": span_fields,
            "candidate_entities": candidate_entities,
            "candidate_entity_prior": prior_field,
            "candidate_segment_ids": candidate_segment_ids
            }
        if gold_entities:
            labels = TextField([Token(entity) for entity in gold_entities],
                               token_indexers=self.entity_indexer)
            fields["gold_entities"] = labels

        fields["doc_id"] = MetadataField(doc_id)

        if self.extra_candidate_generators:
            tokens = " ".join(tokenized_text)
            extra_candidates = {
                    key: generator.get_mentions_raw_text(tokens, whitespace_tokenize=True)
                    for key, generator in self.extra_candidate_generators.items()
            }
            fields['extra_candidates'] = MetadataField(extra_candidates)

        return Instance(fields, should_remap_span_indices=self.should_remap_span_indices) 
Example #28
Source File: multiple_correct_mcq_entailment.py    From multee with Apache License 2.0
def text_to_instance(self, # pylint: disable=arguments-differ
                         premises: List[str],
                         hypotheses: List[str],
                         answer_indices: List[int] = None,
                         relevant_sentence_idxs: List[int] = None) -> Instance:
        fields = {}
        premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                           for premise in premises]
        hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                             for hypothesis in hypotheses]
        if premises:
            premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                    for premise_tokens in premises_tokens]
            premises_field = ListField(premises_text_fields)
        else:
            empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
            premises_field = empty_stub.empty_field()
        fields['premises'] = premises_field

        hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                                for hypothesis_tokens in hypotheses_tokens]
        hypotheses_field = ListField(hypotheses_text_fields)
        fields['hypotheses'] = hypotheses_field

        # If sentence relevance is available
        if relevant_sentence_idxs is not None:
            relevance_presence_mask = np.zeros(len(premises))
            for idx in relevant_sentence_idxs:
                relevance_presence_mask[idx] = 1
            fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

        # If answer_indices labels are available
        if answer_indices is not None:
            answer_correctness_mask = np.zeros(len(hypotheses))
            for answer_index in answer_indices:
                answer_correctness_mask[answer_index] = 1
            fields['answer_correctness_mask'] = ArrayField(answer_correctness_mask, padding_value=-1, dtype=np.int64)  # np.long is no longer a valid NumPy dtype

        paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
        paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
        fields['paragraph'] = paragraph_text_field
        return Instance(fields) 
Example #29
Source File: fever_reader_with_wn.py    From combine-FEVER-NSMN with MIT License
def text_to_instance(self,  # type: ignore
                         premise: str,
                         hypothesis: str,
                         pid: str = None,
                         label: str = None) -> Instance:

        fields: Dict[str, Field] = {}

        premise_tokens = [Token(t) for t in premise.split(' ')]  # Removing code for parentheses in NLI
        hypothesis_tokens = [Token(t) for t in hypothesis.split(' ')]

        if self.max_l is not None:
            premise_tokens = premise_tokens[:self.max_l]
            hypothesis_tokens = hypothesis_tokens[:self.max_l]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)

        # WN feature dict:
        premise_s = premise.split(' ')
        hypothesis_s = hypothesis.split(' ')

        if self.max_l is not None:
            premise_s = premise_s[:self.max_l]
            hypothesis_s = hypothesis_s[:self.max_l]

        example_feature = wn_persistent_api.compute_wn_features_p_accerate(premise_s,
                                                                           hypothesis_s,
                                                                           self.wn_p_dict)

        p_wn_nparray, h_wn_nparray = wn_persistent_api.wn_raw_feature_to_nparray(
            example_feature,
            self.wn_feature_list)

        assert len(premise_tokens) == p_wn_nparray.shape[0]
        assert len(hypothesis_tokens) == h_wn_nparray.shape[0]

        fields['p_wn_feature'] = ArrayField(p_wn_nparray)
        fields['h_wn_feature'] = ArrayField(h_wn_nparray)

        if label:
            fields['label'] = LabelField(label, label_namespace='labels')

        if pid:
            fields['pid'] = IdField(pid)

        return Instance(fields)