Python allennlp.data.fields.ListField() Examples
The following are 30 code examples of allennlp.data.fields.ListField().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module allennlp.data.fields, or try the search function.
Example #1
Source File: arc_multichoice_json_reader.py From ARC-Solvers with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int) -> Instance:
    """Turn one multiple-choice question into an ``Instance``.

    The question and each answer choice are tokenized with
    ``self._tokenizer`` and wrapped in ``TextField`` objects that share
    ``self._token_indexers``.

    Parameters
    ----------
    item_id : identifier of the question; only stored in the metadata.
    question_text : raw question string.
    choice_text_list : raw answer-choice strings, one entry per choice.
    answer_id : label value, stored with ``skip_indexing=True``.

    Returns
    -------
    An ``Instance`` holding the fields ``question``, ``choices_list``,
    ``label`` and ``metadata``.
    """
    # pylint: disable=arguments-differ
    tokenized_question = self._tokenizer.tokenize(question_text)
    tokenized_choices = [self._tokenizer.tokenize(choice) for choice in choice_text_list]

    # The raw strings and the token texts travel with the instance so they
    # can be read back without the tokenizer.
    metadata = {
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": [token.text for token in tokenized_question],
        "choice_tokens_list": [[token.text for token in choice_tokens]
                               for choice_tokens in tokenized_choices],
    }

    fields: Dict[str, Field] = {
        'question': TextField(tokenized_question, self._token_indexers),
        'choices_list': ListField([TextField(choice_tokens, self._token_indexers)
                                   for choice_tokens in tokenized_choices]),
        'label': LabelField(answer_id, skip_indexing=True),
        "metadata": MetadataField(metadata),
    }
    return Instance(fields)
Example #2
Source File: array_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_padding_handles_list_fields_with_padding_values(self):
    """A ``ListField`` of ``ArrayField``s pads with the arrays' ``padding_value``.

    Arrays of ones shaped (2, 3) and (1, 5), plus an empty field, are expected
    to come back as one (3, 2, 5) tensor whose unused cells hold -1.
    """
    two_by_three = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    one_by_five = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    blank = two_by_three.empty_field()
    list_field = ListField([two_by_three, one_by_five, blank])

    lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(lengths).detach().cpu().numpy()

    # Start from an all-padding tensor and mark where the real data sits.
    expected = numpy.full((3, 2, 5), -1.0)
    expected[0, :, :3] = 1.0
    expected[1, 0, :] = 1.0
    numpy.testing.assert_array_equal(actual, expected)
Example #3
Source File: list_field_test.py From magnitude with MIT License | 6 votes |
def test_fields_can_pad_to_greater_than_max_length(self):
    """Padding lengths larger than the data produce extra zero padding.

    The computed padding lengths are overridden to 7 tokens and 5 fields, so
    the three real rows gain trailing zeros and two all-zero rows are added.
    """
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)

    lengths = list_field.get_padding_lengths()
    lengths[u"list_num_tokens"] = 7
    lengths[u"num_fields"] = 5
    tensor_dict = list_field.as_tensor(lengths)

    expected_rows = [
        [2, 3, 4, 5, 0, 0, 0],
        [2, 3, 4, 1, 5, 0, 0],
        [2, 3, 1, 5, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
    ]
    for row_index, expected in enumerate(expected_rows):
        actual = tensor_dict[u"words"][row_index].detach().cpu().numpy()
        numpy.testing.assert_array_almost_equal(actual, numpy.array(expected))
Example #4
Source File: summarization_sentence_tagger_reader.py From summarus with Apache License 2.0 | 6 votes |
def text_to_instance(self, text: str, sentences: List[str] = None, tags: List[int] = None) -> Instance:
    """Build a sentence-level ``Instance`` from a document.

    Parameters
    ----------
    text : raw document; only used to derive sentences when ``sentences``
        is ``None``.
    sentences : optional pre-split sentences.
    tags : optional per-sentence tags; truncated to
        ``self._max_sentences_count`` entries.

    Returns
    -------
    An ``Instance`` with ``source_sentences`` (a ``ListField`` of
    ``TextField``s) and, when ``tags`` is truthy, ``sentences_tags``.
    """
    if sentences is None:
        # No pre-split sentences supplied: segment the raw text ourselves.
        if self._language == "ru":
            sentences = [piece.text for piece in razdel.sentenize(text)]
        else:
            sentences = nltk.tokenize.sent_tokenize(text)

    sentence_limit = self._max_sentences_count
    sentence_fields = []
    for raw_sentence in sentences[:sentence_limit]:
        prepared = raw_sentence.lower() if self._lowercase else raw_sentence
        body = self._tokenizer.tokenize(prepared)[:self._sentence_max_tokens]
        # Each sentence is wrapped in explicit start/end markers.
        wrapped = [Token(START_SYMBOL), *body, Token(END_SYMBOL)]
        sentence_fields.append(TextField(wrapped, self._source_token_indexers))

    sentences_field = ListField(sentence_fields)
    result = {'source_sentences': sentences_field}
    if tags:
        result["sentences_tags"] = SequenceLabelField(tags[:sentence_limit], sentences_field)
    return Instance(result)
Example #5
Source File: test_dict_field.py From kb with Apache License 2.0 | 5 votes |
def test_list_field_of_dict_field(self):
    """Batch instances whose ``candidates`` are ``ListField``s of ``DictField``s.

    After iterating, the batched ``candidate_entities['entity']`` tensor must
    have the same shape as the batched ``candidate_entity_prior`` tensor.
    """
    from allennlp.data import Instance
    from allennlp.data.iterators import BasicIterator

    words = "The long sentence .".split()
    sentence_field = TextField(
        [Token(word) for word in words],
        token_indexers={'tokens': SingleIdTokenIndexer()},
    )

    entity_field = TextField(
        [Token("entity1 entity2 entity3"), Token("entity_unk"), Token("entity2 entity3")],
        token_indexers=self.entity_indexer,
    )
    prior_field = ArrayField(
        np.array([[0.1, 0.1, 0.8],
                  [1.0, 0.0, 0.0],
                  [0.33, 0.67, 0.0]])
    )
    span_field = ListField(
        [SpanField(1, 1, sentence_field),
         SpanField(1, 2, sentence_field),
         SpanField(1, 3, sentence_field)],
    )
    third_instance_fields = {
        "candidate_entities": entity_field,
        "candidate_entity_prior": prior_field,
        "candidate_spans": span_field,
    }

    iterator = BasicIterator()
    iterator.index_with(self.vocab)

    first_instance = Instance({"candidates": ListField([
        DictField(self.instance1_fields),
        DictField(self.instance2_fields)])})
    second_instance = Instance({"candidates": ListField([
        DictField(self.instance1_fields),
        DictField(third_instance_fields)])})
    instances = [first_instance, second_instance]

    # Drain the iterator; only the last batch it yields is inspected below.
    for batch in iterator(instances, num_epochs=1, shuffle=False):
        pass

    candidates = batch['candidates']
    entity_shape = candidates['candidate_entities']['entity'].shape
    prior_shape = candidates['candidate_entity_prior'].shape
    self.assertTrue(entity_shape == prior_shape)
Example #6
Source File: single_correct_mcq_entailment.py From multee with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # pylint: disable=arguments-differ
                     premises: List[str],
                     hypotheses: List[str],
                     answer_index: int = None,
                     relevant_sentence_idxs: List[int] = None) -> Instance:
    """Build an ``Instance`` for a multiple-choice entailment example.

    Parameters
    ----------
    premises : premise sentences; may be empty.
    hypotheses : candidate hypotheses, one per answer option.
    answer_index : optional index of the correct hypothesis; must lie in
        ``range(len(hypotheses))``.
    relevant_sentence_idxs : optional indices into ``premises`` that are
        marked with 1 in the relevance mask.

    Returns
    -------
    An ``Instance`` with ``premises``, ``hypotheses`` and ``paragraph``,
    plus ``relevance_presence_mask`` and ``answer_index`` when the matching
    arguments are given.

    Raises
    ------
    ConfigurationError
        If ``answer_index`` is given but is not in ``range(len(hypotheses))``.
    """
    fields = {}

    # The negative slices keep the *trailing* tokens of each text.
    premises_tokens = [self._tokenizer.tokenize(premise)[-self._premise_max_tokens:]
                       for premise in premises]
    hypotheses_tokens = [self._tokenizer.tokenize(hypothesis)[-self._hypothesis_max_tokens:]
                         for hypothesis in hypotheses]

    if premises:
        premises_text_fields = [TextField(premise_tokens, self._token_indexers)
                                for premise_tokens in premises_tokens]
        premises_field = ListField(premises_text_fields)
    else:
        # A ListField cannot be built from nothing, so build a one-element
        # stub and take its empty counterpart.
        empty_stub = ListField([TextField([Token('dummy')], self._token_indexers)])
        premises_field = empty_stub.empty_field()
    fields['premises'] = premises_field

    hypotheses_text_fields = [TextField(hypothesis_tokens, self._token_indexers)
                              for hypothesis_tokens in hypotheses_tokens]
    hypotheses_field = ListField(hypotheses_text_fields)
    fields['hypotheses'] = hypotheses_field

    # If sentence relevance is available
    if relevant_sentence_idxs is not None:
        relevance_presence_mask = np.zeros(len(premises))
        for idx in relevant_sentence_idxs:
            relevance_presence_mask[idx] = 1
        fields['relevance_presence_mask'] = ArrayField(np.array(relevance_presence_mask))

    # If entailment labels are available
    if answer_index is not None:
        if answer_index not in range(0, len(hypotheses)):
            raise ConfigurationError("Provided label must be in 0 to {}".format(len(hypotheses)))
        # `np.long` was a deprecated alias of the builtin `int` and raises
        # AttributeError on NumPy 1.24+; the builtin is the same dtype
        # request and works on every NumPy version.
        fields['answer_index'] = ArrayField(np.array(answer_index), padding_value=-1, dtype=int)

    # Flatten all (truncated) premise tokens into a single paragraph field.
    paragraph_tokens = [token for premise_tokens in premises_tokens for token in premise_tokens]
    paragraph_text_field = TextField(paragraph_tokens, self._token_indexers)
    fields['paragraph'] = paragraph_text_field
    return Instance(fields)
Example #7
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_padding_handles_list_fields_with_padding_values(self):
    """Check that list padding honours a custom ``padding_value`` of -1.

    The expected tensor has shape (3, 2, 5): ones where the source arrays
    had data, -1 everywhere else, including the whole empty field.
    """
    pad = -1.
    first = ArrayField(numpy.ones([2, 3]), padding_value=-1)
    second = ArrayField(numpy.ones([1, 5]), padding_value=-1)
    list_field = ListField([first, second, first.empty_field()])

    padding_lengths = list_field.get_padding_lengths()
    returned = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    expected = numpy.array([
        [[1., 1., 1., pad, pad], [1., 1., 1., pad, pad]],
        [[1., 1., 1., 1., 1.], [pad, pad, pad, pad, pad]],
        [[pad, pad, pad, pad, pad], [pad, pad, pad, pad, pad]],
    ])
    numpy.testing.assert_array_equal(returned, expected)
Example #8
Source File: array_field_test.py From magnitude with MIT License | 5 votes |
def test_padding_handles_list_fields(self):
    """A ``ListField`` of ``ArrayField``s is zero-padded to a common shape.

    Arrays of ones shaped (2, 3) and (1, 5), plus an empty field, are expected
    to produce a (3, 2, 5) tensor with zeros in every unused cell.
    """
    two_by_three = ArrayField(numpy.ones([2, 3]))
    one_by_five = ArrayField(numpy.ones([1, 5]))
    blank = two_by_three.empty_field()
    list_field = ListField([two_by_three, one_by_five, blank])

    lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(lengths).detach().cpu().numpy()

    # Build the expectation by marking the data cells on a zero canvas.
    expected = numpy.zeros((3, 2, 5))
    expected[0, :, :3] = 1.
    expected[1, 0, :] = 1.
    numpy.testing.assert_array_equal(actual, expected)
Example #9
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_printing_doesnt_crash(self):
    """Smoke test: printing a ``ListField`` must not raise."""
    members = [self.field1, self.field2]
    print(ListField(members))
Example #10
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_as_tensor_can_handle_multiple_token_indexers_and_empty_fields(self):
    """``as_tensor`` works with two token indexers and a leading empty field.

    The empty field is expected to become an all-zero row in both the
    ``words`` and the ``characters`` tensors.
    """
    # pylint: disable=protected-access
    for text_field in (self.field1, self.field2, self.field3):
        text_field._token_indexers = self.words_and_characters_indexers

    list_field = ListField([self.field1.empty_field(), self.field1, self.field2])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())

    words = tensor_dict[u"words"].detach().cpu().numpy()
    characters = tensor_dict[u"characters"].detach().cpu().numpy()

    expected_words = numpy.array([[0, 0, 0, 0, 0],
                                  [2, 3, 4, 5, 0],
                                  [2, 3, 4, 1, 5]])
    expected_characters = [
        numpy.zeros([5, 9]),
        numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                     [1, 2, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [2, 3, 4, 5, 3, 4, 6, 3, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
        numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                     [1, 2, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 3, 1, 3, 4, 5],
                     [2, 3, 4, 5, 3, 4, 6, 3, 0]]),
    ]

    numpy.testing.assert_array_almost_equal(words, expected_words)
    for position, expected in enumerate(expected_characters):
        numpy.testing.assert_array_almost_equal(characters[position], expected)
Example #11
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_as_tensor_can_handle_multiple_token_indexers(self):
    """``as_tensor`` returns one padded tensor per token indexer.

    All three text fields get both a word and a character indexer; the
    resulting ``words`` and ``characters`` tensors are checked row by row.
    """
    # pylint: disable=protected-access
    for text_field in (self.field1, self.field2, self.field3):
        text_field._token_indexers = self.words_and_characters_indexers

    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)
    tensor_dict = list_field.as_tensor(list_field.get_padding_lengths())

    words = tensor_dict[u"words"].detach().cpu().numpy()
    characters = tensor_dict[u"characters"].detach().cpu().numpy()

    expected_words = numpy.array([[2, 3, 4, 5, 0],
                                  [2, 3, 4, 1, 5],
                                  [2, 3, 1, 5, 0]])
    expected_characters = [
        numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                     [1, 2, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [2, 3, 4, 5, 3, 4, 6, 3, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
        numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                     [1, 2, 0, 0, 0, 0, 0, 0, 0],
                     [1, 0, 0, 0, 0, 0, 0, 0, 0],
                     [1, 1, 1, 1, 3, 1, 3, 4, 5],
                     [2, 3, 4, 5, 3, 4, 6, 3, 0]]),
        numpy.array([[5, 1, 1, 2, 0, 0, 0, 0, 0],
                     [1, 2, 0, 0, 0, 0, 0, 0, 0],
                     [1, 4, 1, 5, 1, 3, 1, 0, 0],
                     [2, 3, 4, 5, 3, 4, 6, 3, 0],
                     [0, 0, 0, 0, 0, 0, 0, 0, 0]]),
    ]

    numpy.testing.assert_array_almost_equal(words, expected_words)
    for position, expected in enumerate(expected_characters):
        numpy.testing.assert_array_almost_equal(characters[position], expected)
Example #12
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_nested_list_fields_are_padded_correctly(self):
    """Nested ``ListField``s of labels pad to the longest inner list with -1.

    The outer list holds an empty inner list, one with five labels and one
    with six; the expected tensor is 3 x 6.
    """
    five_labels = ListField([LabelField(letter) for letter in u'abcde'])
    six_labels = ListField([LabelField(letter) for letter in u'fghijk'])
    outer = ListField([five_labels.empty_field(), five_labels, six_labels])
    outer.index(self.vocab)

    padding_lengths = outer.get_padding_lengths()
    assert padding_lengths == {u'num_fields': 3, u'list_num_fields': 6}

    tensor = outer.as_tensor(padding_lengths).detach().cpu().numpy()
    expected = [
        [-1] * 6,
        list(range(5)) + [-1],
        list(range(5, 11)),
    ]
    numpy.testing.assert_almost_equal(tensor, expected)
Example #13
Source File: arc_multichoice_json_reader.py From OpenBookQA with Apache License 2.0 | 5 votes |
def text_to_instance(self,  # type: ignore
                     item_id: Any,
                     question_text: str,
                     choice_text_list: List[str],
                     answer_id: int
                     ) -> Instance:
    """Create an ``Instance`` from a question and its answer choices.

    Parameters
    ----------
    item_id : question identifier, copied into the metadata.
    question_text : the question as a raw string.
    choice_text_list : the answer choices as raw strings.
    answer_id : the label, passed to ``LabelField`` with
        ``skip_indexing=True``.

    Returns
    -------
    An ``Instance`` with ``question``, ``choices_list``, ``label`` and
    ``metadata`` fields.
    """
    # pylint: disable=arguments-differ
    indexers = self._token_indexers
    q_tokens = self._tokenizer.tokenize(question_text)
    per_choice_tokens = [self._tokenizer.tokenize(choice_text) for choice_text in choice_text_list]

    fields: Dict[str, Field] = {}
    fields['question'] = TextField(q_tokens, indexers)

    choice_fields = [TextField(tokens, indexers) for tokens in per_choice_tokens]
    fields['choices_list'] = ListField(choice_fields)
    fields['label'] = LabelField(answer_id, skip_indexing=True)

    # Keep the untokenized inputs and the token strings next to the tensors.
    question_token_texts = [tok.text for tok in q_tokens]
    choice_token_texts = [[tok.text for tok in tokens] for tokens in per_choice_tokens]
    fields["metadata"] = MetadataField({
        "id": item_id,
        "question_text": question_text,
        "choice_text_list": choice_text_list,
        "question_tokens": question_token_texts,
        "choice_tokens_list": choice_token_texts,
    })

    return Instance(fields)
Example #14
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_list_field_can_handle_empty_index_fields(self):
    """An empty ``IndexField`` inside a ``ListField`` is expected to become -1."""
    members = [self.index_field, self.index_field, self.empty_index_field]
    list_field = ListField(members)
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    expected = numpy.array([[1], [1], [-1]])
    numpy.testing.assert_array_equal(actual, expected)
Example #15
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_list_field_can_handle_empty_text_fields(self):
    """An empty ``TextField`` inside a ``ListField`` is expected to become a zero row."""
    members = [self.field1, self.field2, self.empty_text_field]
    list_field = ListField(members)
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    words = list_field.as_tensor(padding_lengths)[u"words"].detach().cpu().numpy()

    expected = numpy.array([[2, 3, 4, 5, 0],
                            [2, 3, 4, 1, 5],
                            [0, 0, 0, 0, 0]])
    numpy.testing.assert_array_equal(words, expected)
Example #16
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_get_padding_lengths(self):
    """Padding lengths report the field count and the longest token count."""
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)

    expected = {u"num_fields": 3, u"list_num_tokens": 5}
    assert list_field.get_padding_lengths() == expected
Example #17
Source File: list_field_test.py From magnitude with MIT License | 5 votes |
def test_list_field_can_handle_empty_sequence_label_fields(self):
    """An empty ``SequenceLabelField`` in a ``ListField`` is expected to become a zero row."""
    members = [self.sequence_label_field,
               self.sequence_label_field,
               self.empty_sequence_label_field]
    list_field = ListField(members)
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    expected = numpy.array([[1, 1, 0, 1],
                            [1, 1, 0, 1],
                            [0, 0, 0, 0]])
    numpy.testing.assert_array_equal(actual, expected)
Example #18
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_empty_list_can_be_tensorized(self):
    """Smoke test: an instance holding a ``ListField`` of one empty ``TextField``
    can be indexed and converted to tensors without raising."""
    tokenizer = SpacyTokenizer()
    foo_field = TextField(tokenizer.tokenize("Foo"), self.word_indexer)
    empty_list = ListField([foo_field.empty_field()])
    bar_field = TextField(tokenizer.tokenize("BAR"), self.word_indexer)

    instance = Instance({"list": empty_list, "bar": bar_field})
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()
Example #19
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_sequence_methods(self):
    """``ListField`` supports ``len``, integer indexing and iteration."""
    members = [self.field1, self.field2, self.field3]
    list_field = ListField(members)

    assert len(list_field) == 3
    assert list_field[1] == self.field2
    assert list(list_field) == members
Example #20
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_printing_doesnt_crash(self):
    """Smoke test: ``print`` on a ``ListField`` completes without an exception."""
    print(ListField([self.field1, self.field2]))
Example #21
Source File: babi.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(
    self,  # type: ignore
    context: List[List[str]],
    question: List[str],
    answer: str,
    supports: List[int],
) -> Instance:
    """Build an ``Instance`` from pre-tokenized context, question and answer.

    # Parameters

    context : one list of words per context line.
    question : the words of the question.
    answer : the answer, wrapped as a single token.
    supports : indices into ``context``; only used when
        ``self._keep_sentences`` is set.

    # Returns

    An ``Instance`` with ``context``, ``question`` and ``answer``. When
    ``self._keep_sentences`` is set, ``context`` is a ``ListField`` with one
    ``TextField`` per line and a ``supports`` field is added; otherwise
    ``context`` is a single ``TextField`` over all words.
    """
    indexers = self._token_indexers
    fields: Dict[str, Field] = {}

    if self._keep_sentences:
        context_field: Field = ListField(
            [TextField([Token(word) for word in line], indexers) for line in context]
        )
        # Each support points at a sentence of the context list built above.
        fields["supports"] = ListField(
            [IndexField(support, context_field) for support in supports]
        )
    else:
        context_field = TextField(
            [Token(word) for line in context for word in line], indexers
        )

    fields["context"] = context_field
    fields["question"] = TextField([Token(word) for word in question], indexers)
    fields["answer"] = TextField([Token(answer)], indexers)
    return Instance(fields)
Example #22
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_fields_can_pad_to_greater_than_max_length(self):
    """Oversized padding lengths yield extra zero columns and zero rows.

    The padding lengths are overridden to 7 tokens and 5 fields before the
    tensor is built, then every row of ``words -> tokens`` is checked.
    """
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)

    lengths = list_field.get_padding_lengths()
    lengths["list_words___tokens"] = 7
    lengths["num_fields"] = 5
    token_tensor = list_field.as_tensor(lengths)["words"]["tokens"]

    expected_rows = [
        [2, 3, 4, 5, 0, 0, 0],
        [2, 3, 4, 1, 5, 0, 0],
        [2, 3, 1, 5, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
    ]
    for row_index, expected in enumerate(expected_rows):
        numpy.testing.assert_array_almost_equal(
            token_tensor[row_index].detach().cpu().numpy(),
            numpy.array(expected),
        )
Example #23
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_nested_list_fields_are_padded_correctly(self):
    """A ``ListField`` of label ``ListField``s pads inner lists with -1.

    Inner lists of 0 (empty), 5 and 6 labels are expected to give padding
    lengths of 3 fields by 6 inner fields and a matching 3 x 6 tensor.
    """
    short_inner = ListField([LabelField(letter) for letter in "abcde"])
    long_inner = ListField([LabelField(letter) for letter in "fghijk"])
    outer = ListField([short_inner.empty_field(), short_inner, long_inner])
    outer.index(self.vocab)

    padding_lengths = outer.get_padding_lengths()
    assert padding_lengths == {"num_fields": 3, "list_num_fields": 6}

    actual = outer.as_tensor(padding_lengths).detach().cpu().numpy()
    expected = [
        [-1] * 6,
        list(range(5)) + [-1],
        list(range(5, 11)),
    ]
    numpy.testing.assert_almost_equal(actual, expected)
Example #24
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_all_fields_padded_to_max_length(self):
    """Every text field in the list is padded to the longest one (5 tokens here)."""
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    token_tensor = list_field.as_tensor(padding_lengths)["words"]["tokens"]

    expected_rows = [
        [2, 3, 4, 5, 0],
        [2, 3, 4, 1, 5],
        [2, 3, 1, 5, 0],
    ]
    for row_index, expected in enumerate(expected_rows):
        numpy.testing.assert_array_almost_equal(
            token_tensor[row_index].detach().cpu().numpy(), numpy.array(expected)
        )
Example #25
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_list_field_can_handle_empty_sequence_label_fields(self):
    """Two real sequence-label fields plus an empty one tensorize to a 3 x 4
    array whose last row is all zeros."""
    filled = self.sequence_label_field
    list_field = ListField([filled, filled, self.empty_sequence_label_field])
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    expected = numpy.array([[1, 1, 0, 1], [1, 1, 0, 1], [0, 0, 0, 0]])
    numpy.testing.assert_array_equal(actual, expected)
Example #26
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_list_field_can_handle_empty_index_fields(self):
    """Two real index fields plus an empty one tensorize to ``[[1], [1], [-1]]``."""
    filled = self.index_field
    list_field = ListField([filled, filled, self.empty_index_field])
    list_field.index(self.vocab)

    padding_lengths = list_field.get_padding_lengths()
    actual = list_field.as_tensor(padding_lengths).detach().cpu().numpy()

    numpy.testing.assert_array_equal(actual, numpy.array([[1], [1], [-1]]))
Example #27
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_get_padding_lengths(self):
    """Padding lengths contain the number of fields and the per-indexer token count."""
    list_field = ListField([self.field1, self.field2, self.field3])
    list_field.index(self.vocab)

    expected = {"num_fields": 3, "list_words___tokens": 5}
    assert list_field.get_padding_lengths() == expected
Example #28
Source File: elmo_indexer_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_elmo_empty_token_list(self):
    """The ELMo indexer's empty token list pads to zeros inside a batch.

    Two instances with one and two targets are batched; the missing second
    target of the first instance must be all zeros, the real targets must not.
    """
    # Basic test
    elmo_indexer = ELMoTokenCharactersIndexer()
    assert {"elmo_tokens": []} == elmo_indexer.get_empty_token_list()

    # Real world test
    indexers = {"elmo": elmo_indexer}

    def single_word_field(word):
        return TextField([Token(word)], indexers)

    tokens_1 = single_word_field("Apple")
    targets_1 = ListField([single_word_field("Apple")])
    tokens_2 = TextField([Token("Screen"), Token("device")], indexers)
    targets_2 = ListField([single_word_field("Screen"), single_word_field("Device")])

    batch = Batch([
        Instance({"tokens": tokens_1, "targets": targets_1}),
        Instance({"tokens": tokens_2, "targets": targets_2}),
    ])
    batch.index_instances(Vocabulary())
    target_indices = batch.as_tensor_dict()["targets"]["elmo"]["elmo_tokens"]

    all_zeros = np.zeros((1, 50))

    # The TextField that is empty should have been created using the
    # `get_empty_token_list` and then padded with zeros.
    np.testing.assert_array_equal(all_zeros, target_indices[0][1].numpy())

    # Every real target must differ from the all-zero padding row.
    for instance_index, target_index in [(0, 0), (1, 0), (1, 1)]:
        real_target = target_indices[instance_index][target_index]
        with pytest.raises(AssertionError):
            np.testing.assert_array_equal(all_zeros, real_target)
Example #29
Source File: text_classification_json.py From allennlp with Apache License 2.0 | 5 votes |
def text_to_instance(
    self, text: str, label: Union[str, int] = None
) -> Instance:  # type: ignore
    """
    Build an `Instance` from a piece of text and an optional label.

    # Parameters

    text : `str`, required.
        The text to classify.
    label : `Union[str, int]`, optional, (default = `None`).
        The label for this text.

    # Returns

    An `Instance` containing the following fields:
        - tokens : a `ListField` of per-sentence `TextField`s when
          `self._segment_sentences` is set, otherwise a single `TextField`
          over the whole text.
        - label (`LabelField`) : The label of the sentence or phrase;
          only present when `label` is not `None`.
    """

    def tokenize(chunk):
        # Tokenize, then truncate only when a maximum length is configured.
        pieces = self._tokenizer.tokenize(chunk)
        if self._max_sequence_length is not None:
            pieces = self._truncate(pieces)
        return pieces

    fields: Dict[str, Field] = {}
    if self._segment_sentences:
        sentences: List[Field] = [
            TextField(tokenize(sentence), self._token_indexers)
            for sentence in self._sentence_segmenter.split_sentences(text)
        ]
        fields["tokens"] = ListField(sentences)
    else:
        fields["tokens"] = TextField(tokenize(text), self._token_indexers)

    if label is not None:
        fields["label"] = LabelField(label, skip_indexing=self._skip_label_indexing)
    return Instance(fields)
Example #30
Source File: wordnet.py From kb with Apache License 2.0 | 4 votes |
def text_to_instance(self,
                     tokens: List[str],
                     candidate_entities: List[List[str]],
                     candidate_spans: List[List[int]],
                     candidate_entity_prior: List[List[float]],
                     gold_entities: List[str] = None,
                     gold_data_ids: List[str] = None):
    """Build an ``Instance`` from tokens and their candidate entities.

    Parameters
    ----------
    tokens : the words of the sentence.
    candidate_entities : for each candidate span, its candidate entity ids.
    candidate_spans : for each candidate span, a pair whose first two
        items are passed to ``SpanField`` as start and end.
    candidate_entity_prior : for each candidate span, the prior of each of
        its candidates. Rows may have different lengths; they are padded
        with 0.0 on a copy, the caller's lists are left untouched.
    gold_entities : optional gold entity id per span.
    gold_data_ids : optional ids stored as metadata.

    Returns
    -------
    An ``Instance`` with ``tokens``, ``candidate_entities``,
    ``candidate_entity_prior``, ``candidate_segment_ids`` and
    ``candidate_spans``, plus ``gold_entities``, ``gold_data_ids`` and
    ``extra_candidates`` when available.
    """
    # prior needs to be 2D and full
    # can look like [[0.2, 0.8], [1.0]] if one candidate for second
    # candidate span and two candidates for first
    max_cands = max(len(p) for p in candidate_entity_prior)
    # Pad copies of the rows so the argument is not mutated in place.
    padded_prior = [
        list(p) + [0.0] * (max_cands - len(p))
        for p in candidate_entity_prior
    ]
    np_prior = np.array(padded_prior)

    fields = {
        "tokens": TextField([Token(t) for t in tokens],
                            token_indexers=self.token_indexers),
        # join by space, then retokenize in the "character indexer"
        "candidate_entities": TextField(
            [Token(" ".join(candidate_list)) for candidate_list in candidate_entities],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(np_prior),
        # only one sentence
        # `np.int` was a deprecated alias of the builtin `int` and raises
        # AttributeError on NumPy 1.24+; the builtin requests the same dtype.
        "candidate_segment_ids": ArrayField(
            np.array([0] * len(candidate_entities)), dtype=int
        )
    }

    if gold_entities is not None:
        fields["gold_entities"] = TextField([Token(entity) for entity in gold_entities],
                                            token_indexers=self.entity_indexer)
    if gold_data_ids is not None:
        fields["gold_data_ids"] = MetadataField(gold_data_ids)

    span_fields = [SpanField(span[0], span[1], fields['tokens'])
                   for span in candidate_spans]
    fields['candidate_spans'] = ListField(span_fields)

    if self.extra_candidate_generators:
        # The extra generators work on the whitespace-joined sentence; use a
        # separate name instead of rebinding the `tokens` parameter.
        raw_text = " ".join(tokens)
        extra_candidates = {
            key: generator.get_mentions_raw_text(raw_text, whitespace_tokenize=True)
            for key, generator in self.extra_candidate_generators.items()
        }
        fields['extra_candidates'] = MetadataField(extra_candidates)

    return Instance(fields, should_remap_span_indices=self.should_remap_span_indices)