Python allennlp.data.fields.TextField() Examples
The following are 30 code examples of allennlp.data.fields.TextField(), drawn from open-source projects. The source file, originating project, and license are noted above each example. Related functions and classes can be found in the allennlp.data.fields module.
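Before the examples, here is a minimal sketch of the typical TextField lifecycle: construct the field from tokens and token indexers, build a Vocabulary, index the field, then pad and convert it to tensors. This is an illustrative sketch assuming a recent AllenNLP (1.x/2.x) API, not taken from any of the projects below; the field and namespace names are arbitrary.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

# Wrap pre-tokenized text in a TextField; the dict maps an arbitrary indexer
# name to the TokenIndexer that will turn tokens into ids.
tokens = [Token(t) for t in ["This", "is", "a", "sentence", "."]]
field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
instance = Instance({"text": field})

# Build a vocabulary from the instance, index the field against it,
# then pad and convert to tensors.
vocab = Vocabulary.from_instances([instance])
field.index(vocab)
padding_lengths = field.get_padding_lengths()
tensor_dict = field.as_tensor(padding_lengths)  # nested dict of torch tensors

Most of the examples below follow this same pattern, either inside a DatasetReader's text_to_instance()/_read() methods or in unit tests of field behavior.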
Example #1
Source File: instance_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_duplicate(self):
    # Verify the `duplicate()` method works with a `PretrainedTransformerIndexer` in
    # a `TextField`. See https://github.com/allenai/allennlp/issues/4270.
    instance = Instance(
        {
            "words": TextField(
                [Token("hello")], {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
            )
        }
    )
    other = instance.duplicate()
    assert other == instance

    # Adding new fields to the original instance should not affect the duplicate.
    instance.add_field("labels", LabelField("some_label"))
    assert "labels" not in other.fields
    assert other != instance  # sanity check on the '__eq__' method.
Example #2
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, context_tokens: List[Token], tokens: List[Token], tags: List[str] = None,
                     intents: List[str] = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    # print([t.text for t in context_tokens])
    fields["context_tokens"] = TextField(context_tokens, self._token_indexers)
    fields["tokens"] = TextField(tokens, self._token_indexers)
    fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
    if tags is not None:
        fields["tags"] = SequenceLabelField(tags, fields["tokens"])
    if intents is not None:
        fields["intents"] = MultiLabelField(intents, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #3
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_sequence: str, doc_pos_sequence: str, doc_neg_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_pos_tokenized = self._tokenizer.tokenize(doc_pos_sequence)
    if self.max_doc_length > -1:
        doc_pos_tokenized = doc_pos_tokenized[:self.max_doc_length]

    doc_pos_field = TextField(doc_pos_tokenized, self._token_indexers)

    doc_neg_tokenized = self._tokenizer.tokenize(doc_neg_sequence)
    if self.max_doc_length > -1:
        doc_neg_tokenized = doc_neg_tokenized[:self.max_doc_length]

    doc_neg_field = TextField(doc_neg_tokenized, self._token_indexers)

    return Instance({
        "query_tokens": query_field,
        "doc_pos_tokens": doc_pos_field,
        "doc_neg_tokens": doc_neg_field})
Example #4
Source File: ebmnlp.py From scibert with Apache License 2.0 | 6 votes |
def _read(self, file_path: str) -> Iterable[Instance]:
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, "r") as data_file:
        logger.info("Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                fields = [list(field) for field in zip(*fields)]
                tokens_, _, _, pico_tags = fields
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens_]

                yield self.text_to_instance(tokens, pico_tags)
Example #5
Source File: semeval_2010_task_8_reader.py From DISTRE with Apache License 2.0 | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens: List[str],
                     entity_1: Tuple[int],
                     entity_2: Tuple[int],
                     label: str = None) -> Instance:
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}

    tokens = [OpenAISplitter._standardize(token) for token in tokens]
    tokens = ['__start__'] + tokens[entity_1[0]:entity_1[1]+1] + ['__del1__'] + tokens[entity_2[0]:entity_2[1]+1] + ['__del2__'] + tokens + ['__clf__']

    sentence = TextField([Token(text=t) for t in tokens], self._token_indexers)
    fields['sentence'] = sentence
    # fields['entity1'] = SpanField(*entity_1, sequence_field=sentence)
    # fields['entity2'] = SpanField(*entity_2, sequence_field=sentence)

    if label:
        fields['label'] = LabelField(label)

    return Instance(fields)
Example #6
Source File: data_loading.py From teaching with GNU General Public License v3.0 | 6 votes |
def text_to_instance(self, query_id: str, doc_id: str, query_sequence: str, doc_sequence: str) -> Instance:  # type: ignore
    # pylint: disable=arguments-differ

    query_id_field = LabelField(int(query_id), skip_indexing=True)
    doc_id_field = LabelField(int(doc_id), skip_indexing=True)

    query_tokenized = self._tokenizer.tokenize(query_sequence)
    if self.max_query_length > -1:
        query_tokenized = query_tokenized[:self.max_query_length]

    query_field = TextField(query_tokenized, self._token_indexers)

    doc_tokenized = self._tokenizer.tokenize(doc_sequence)
    if self.max_doc_length > -1:
        doc_tokenized = doc_tokenized[:self.max_doc_length]

    doc_field = TextField(doc_tokenized, self._token_indexers)

    return Instance({
        "query_id": query_id_field,
        "doc_id": doc_id_field,
        "query_tokens": query_field,
        "doc_tokens": doc_field})
Example #7
Source File: dataset_reader.py From ConvLab with MIT License | 6 votes |
def text_to_instance(self, tokens: List[Token], tags: List[str] = None, domain: str = None,
                     intent: str = None, dialog_act: Dict[str, Any] = None) -> Instance:  # type: ignore
    """
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields: Dict[str, Field] = {}
    sequence = TextField(tokens, self._token_indexers)
    fields["tokens"] = sequence
    if tags:
        fields["tags"] = SequenceLabelField(tags, sequence)
    if domain:
        fields["domain"] = LabelField(domain, label_namespace="domain_labels")
    if intent:
        fields["intent"] = LabelField(intent, label_namespace="intent_labels")
    if dialog_act is not None:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': dialog_act})
    else:
        fields["metadata"] = MetadataField({"words": [x.text for x in tokens], 'dialog_act': {}})
    return Instance(fields)
Example #8
Source File: dataset_test.py From allennlp with Apache License 2.0 | 6 votes |
def get_instances(self):
    field1 = TextField(
        [Token(t) for t in ["this", "is", "a", "sentence", "."]], self.token_indexer
    )
    field2 = TextField(
        [Token(t) for t in ["this", "is", "a", "different", "sentence", "."]],
        self.token_indexer,
    )
    field3 = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]], self.token_indexer
    )
    field4 = TextField([Token(t) for t in ["this", "is", "short"]], self.token_indexer)
    instances = [
        Instance({"text1": field1, "text2": field2}),
        Instance({"text1": field3, "text2": field4}),
    ]
    return instances
Example #9
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_saving_and_loading_works_with_byte_encoding(self):
    # We're going to set a vocabulary from a TextField using byte encoding, index it, save the
    # vocab, load the vocab, then index the text field again, and make sure we get the same
    # result.
    tokenizer = CharacterTokenizer(byte_encoding="utf-8")
    token_indexer = TokenCharactersIndexer(character_tokenizer=tokenizer, min_padding_length=2)
    tokens = [Token(t) for t in ["Øyvind", "für", "汉字"]]
    text_field = TextField(tokens, {"characters": token_indexer})
    dataset = Batch([Instance({"sentence": text_field})])
    vocab = Vocabulary.from_instances(dataset)
    text_field.index(vocab)
    indexed_tokens = deepcopy(text_field._indexed_tokens)

    vocab_dir = self.TEST_DIR / "vocab_save"
    vocab.save_to_files(vocab_dir)
    vocab2 = Vocabulary.from_files(vocab_dir)
    text_field2 = TextField(tokens, {"characters": token_indexer})
    text_field2.index(vocab2)
    indexed_tokens2 = deepcopy(text_field2._indexed_tokens)
    assert indexed_tokens == indexed_tokens2
Example #10
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_from_params_extend_config(self):
    vocab_dir = self.TEST_DIR / "vocab_save"
    original_vocab = Vocabulary(non_padded_namespaces=["tokens"])
    original_vocab.add_token_to_namespace("a", namespace="tokens")
    original_vocab.save_to_files(vocab_dir)

    text_field = TextField(
        [Token(t) for t in ["a", "b"]], {"tokens": SingleIdTokenIndexer("tokens")}
    )
    instances = Batch([Instance({"text": text_field})])

    # If you ask to extend vocab from `directory`, instances must be passed
    # in Vocabulary constructor, or else there is nothing to extend to.
    params = Params({"type": "extend", "directory": vocab_dir})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params)

    # If you ask to extend vocab, `directory` key must be present in params,
    # or else there is nothing to extend from.
    params = Params({"type": "extend"})
    with pytest.raises(ConfigurationError):
        _ = Vocabulary.from_params(params, instances=instances)
Example #11
Source File: vocabulary_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_max_vocab_size_partial_dict(self):
    indexers = {
        "tokens": SingleIdTokenIndexer(),
        "token_characters": TokenCharactersIndexer(min_padding_length=3),
    }
    instance = Instance(
        {
            "text": TextField(
                [Token(w) for w in "Abc def ghi jkl mno pqr stu vwx yz".split(" ")], indexers
            )
        }
    )
    dataset = Batch([instance])
    params = Params({"max_vocab_size": {"tokens": 1}})

    vocab = Vocabulary.from_params(params=params, instances=dataset)
    assert len(vocab.get_index_to_token_vocabulary("tokens").values()) == 3  # 1 + 2
    assert len(vocab.get_index_to_token_vocabulary("token_characters").values()) == 28  # 26 + 2
Example #12
Source File: index_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_equality(self):
    index_field1 = IndexField(4, self.text)
    index_field2 = IndexField(4, self.text)
    index_field3 = IndexField(
        4,
        TextField(
            [Token(t) for t in ["AllenNLP", "is", "the", "bomb", "!"]],
            {"words": SingleIdTokenIndexer("words")},
        ),
    )

    assert index_field1 == 4
    assert index_field1 == index_field1
    assert index_field1 == index_field2

    assert index_field1 != index_field3
    assert index_field2 != index_field3
    assert index_field3 == index_field3
Example #13
Source File: text_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_as_tensor_handles_characters(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={
            "characters": TokenCharactersIndexer("characters", min_padding_length=1)
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)
    expected_character_array = numpy.array(
        [
            [1, 1, 1, 3, 0, 0, 0, 0],
            [1, 3, 0, 0, 0, 0, 0, 0],
            [1, 0, 0, 0, 0, 0, 0, 0],
            [3, 4, 5, 6, 4, 5, 7, 4],
            [1, 0, 0, 0, 0, 0, 0, 0],
        ]
    )
    numpy.testing.assert_array_almost_equal(
        tensor_dict["characters"]["token_characters"].detach().cpu().numpy(),
        expected_character_array,
    )
Example #14
Source File: text_field_test.py From allennlp with Apache License 2.0 | 6 votes |
def test_token_padding_lengths_are_computed_correctly(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(token_min_padding_length=3),
            "words": SingleIdTokenIndexer("words", token_min_padding_length=3),
            "characters": TokenCharactersIndexer(
                "characters", min_padding_length=1, token_min_padding_length=3
            ),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 3,
        "words___tokens": 3,
        "characters___token_characters": 3,
        "characters___num_token_characters": 8,
    }
    tensors = field.as_tensor(padding_lengths)
    assert tensors["field_with_dict"]["additional_key"].tolist()[-1] == 0
    assert tensors["words"]["tokens"].tolist()[-1] == 0
    assert tensors["characters"]["token_characters"].tolist()[-1] == [0] * 8
Example #15
Source File: conll2003.py From magnitude with MIT License | 6 votes |
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)

    with open(file_path, u"r") as data_file:
        logger.info(u"Reading instances from lines in file at: %s", file_path)

        # Group into alternative divider / sentence chunks.
        for is_divider, lines in itertools.groupby(data_file, _is_divider):
            # Ignore the divider chunks, so that `lines` corresponds to the words
            # of a single sentence.
            if not is_divider:
                fields = [line.strip().split() for line in lines]
                # unzipping trick returns tuples, but our Fields need lists
                tokens, pos_tags, chunk_tags, ner_tags = [list(field) for field in izip(*fields)]
                # TextField requires ``Token`` objects
                tokens = [Token(token) for token in tokens]

                yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
Example #16
Source File: snli.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     premise,
                     hypothesis,
                     label=None):
    # pylint: disable=arguments-differ
    fields = {}
    premise_tokens = self._tokenizer.tokenize(premise)
    hypothesis_tokens = self._tokenizer.tokenize(hypothesis)
    fields[u'premise'] = TextField(premise_tokens, self._token_indexers)
    fields[u'hypothesis'] = TextField(hypothesis_tokens, self._token_indexers)
    if label:
        fields[u'label'] = LabelField(label)

    metadata = {u"premise_tokens": [x.text for x in premise_tokens],
                u"hypothesis_tokens": [x.text for x in hypothesis_tokens]}
    fields[u"metadata"] = MetadataField(metadata)
    return Instance(fields)
Example #17
Source File: semantic_role_labeling.py From magnitude with MIT License | 6 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     verb_label,
                     tags=None):
    u"""
    We take `pre-tokenized` input here, along with a verb label. The verb label should be a
    one-hot binary vector, the same length as the tokens, indicating the position of the verb
    to find arguments for.
    """
    # pylint: disable=arguments-differ
    fields = {}
    text_field = TextField(tokens, token_indexers=self._token_indexers)
    fields[u'tokens'] = text_field
    fields[u'verb_indicator'] = SequenceLabelField(verb_label, text_field)
    if tags:
        fields[u'tags'] = SequenceLabelField(tags, text_field)

    if all([x == 0 for x in verb_label]):
        verb = None
    else:
        verb = tokens[verb_label.index(1)].text

    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens],
                                         u"verb": verb})
    return Instance(fields)
Example #18
Source File: seq2seq.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, source_string, target_string=None):  # type: ignore
    # pylint: disable=arguments-differ
    tokenized_source = self._source_tokenizer.tokenize(source_string)
    if self._source_add_start_token:
        tokenized_source.insert(0, Token(START_SYMBOL))
    tokenized_source.append(Token(END_SYMBOL))
    source_field = TextField(tokenized_source, self._source_token_indexers)
    if target_string is not None:
        tokenized_target = self._target_tokenizer.tokenize(target_string)
        tokenized_target.insert(0, Token(START_SYMBOL))
        tokenized_target.append(Token(END_SYMBOL))
        target_field = TextField(tokenized_target, self._target_token_indexers)
        return Instance({u"source_tokens": source_field, u"target_tokens": target_field})
    else:
        return Instance({u'source_tokens': source_field})
Example #19
Source File: ebmnlp.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, tokens: List[Token], pico_tags: List[str] = None):
    sequence = TextField(tokens, self._token_indexers)
    instance_fields: Dict[str, Field] = {'tokens': sequence}
    instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})

    # Set the field 'labels' according to the specified PIO element
    if pico_tags is not None:
        instance_fields['tags'] = SequenceLabelField(pico_tags, sequence, self.label_namespace)

    return Instance(instance_fields)
Example #20
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_handles_longer_lengths(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    padding_lengths["words___tokens"] = 10
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(),
        numpy.array([1, 1, 1, 2, 1, 0, 0, 0, 0, 0]),
    )
Example #21
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_as_tensor_handles_words(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    tensor_dict = field.as_tensor(padding_lengths)

    numpy.testing.assert_array_almost_equal(
        tensor_dict["words"]["tokens"].detach().cpu().numpy(), numpy.array([1, 1, 1, 2, 1])
    )
Example #22
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_get_padding_lengths_raises_if_no_indexed_tokens(self):
    field = TextField(
        [Token(t) for t in ["This", "is", "a", "sentence", "."]],
        token_indexers={"words": SingleIdTokenIndexer("words")},
    )
    with pytest.raises(ConfigurationError):
        field.get_padding_lengths()
Example #23
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_index_converts_field_correctly(self):
    vocab = Vocabulary()
    sentence_index = vocab.add_token_to_namespace("sentence", namespace="words")
    capital_a_index = vocab.add_token_to_namespace("A", namespace="words")
    capital_a_char_index = vocab.add_token_to_namespace("A", namespace="characters")
    s_index = vocab.add_token_to_namespace("s", namespace="characters")
    e_index = vocab.add_token_to_namespace("e", namespace="characters")
    n_index = vocab.add_token_to_namespace("n", namespace="characters")
    t_index = vocab.add_token_to_namespace("t", namespace="characters")
    c_index = vocab.add_token_to_namespace("c", namespace="characters")

    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"words": SingleIdTokenIndexer(namespace="words")},
    )
    field.index(vocab)

    assert field._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]

    field1 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        {"characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1)},
    )
    field1.index(vocab)
    assert field1._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
    field2 = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "words": SingleIdTokenIndexer(namespace="words"),
            "characters": TokenCharactersIndexer(namespace="characters", min_padding_length=1),
        },
    )
    field2.index(vocab)
    assert field2._indexed_tokens["words"]["tokens"] == [capital_a_index, sentence_index]
    assert field2._indexed_tokens["characters"]["token_characters"] == [
        [capital_a_char_index],
        [s_index, e_index, n_index, t_index, e_index, n_index, c_index, e_index],
    ]
Example #24
Source File: ontonotes_ner.py From magnitude with MIT License | 5 votes |
def text_to_instance(self,  # type: ignore
                     tokens,
                     ner_tags=None):
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    sequence = TextField(tokens, self._token_indexers)
    instance_fields = {u'tokens': sequence}
    instance_fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    # Add "tag label" to instance
    if ner_tags is not None:
        if self._coding_scheme == u"BIOUL":
            ner_tags = to_bioul(ner_tags, encoding=u"BIO")
        instance_fields[u'tags'] = SequenceLabelField(ner_tags, sequence)
    return Instance(instance_fields)
Example #25
Source File: classification_dataset_reader.py From scibert with Apache License 2.0 | 5 votes |
def text_to_instance(self, text: str, label: str = None, metadata: Any = None) -> Instance:  # type: ignore
    text_tokens = self._tokenizer.tokenize(text)
    fields = {
        'text': TextField(text_tokens, self._token_indexers),
    }
    if label is not None:
        fields['label'] = LabelField(label)

    if metadata:
        fields['metadata'] = MetadataField(metadata)
    return Instance(fields)
Example #26
Source File: index_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def setup_method(self):
    super().setup_method()
    self.text = TextField(
        [Token(t) for t in ["here", "is", "a", "sentence", "."]],
        {"words": SingleIdTokenIndexer("words")},
    )
Example #27
Source File: list_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_empty_list_can_be_tensorized(self):
    tokenizer = SpacyTokenizer()
    tokens = tokenizer.tokenize("Foo")
    text_field = TextField(tokens, self.word_indexer)
    list_field = ListField([text_field.empty_field()])
    fields = {
        "list": list_field,
        "bar": TextField(tokenizer.tokenize("BAR"), self.word_indexer),
    }
    instance = Instance(fields)
    instance.index_fields(self.vocab)
    instance.as_tensor_dict()
Example #28
Source File: sequence_tagging.py From magnitude with MIT License | 5 votes |
def text_to_instance(self, tokens, tags=None):  # type: ignore
    u"""
    We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
    """
    # pylint: disable=arguments-differ
    fields = {}
    sequence = TextField(tokens, self._token_indexers)
    fields[u"tokens"] = sequence
    fields[u"metadata"] = MetadataField({u"words": [x.text for x in tokens]})
    if tags is not None:
        fields[u"tags"] = SequenceLabelField(tags, sequence)
    return Instance(fields)
Example #29
Source File: span_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_equality(self):
    span_field1 = SpanField(2, 3, self.text)
    span_field2 = SpanField(2, 3, self.text)
    span_field3 = SpanField(
        2, 3, TextField([Token(t) for t in ["not", "the", "same", "tokens"]], self.indexers)
    )

    assert span_field1 == (2, 3)
    assert span_field1 == span_field1
    assert span_field1 == span_field2

    assert span_field1 != span_field3
    assert span_field2 != span_field3
    assert span_field3 == span_field3
Example #30
Source File: text_field_test.py From allennlp with Apache License 2.0 | 5 votes |
def test_token_indexer_returns_dict(self):
    field = TextField(
        [Token(t) for t in ["A", "sentence"]],
        token_indexers={
            "field_with_dict": DictReturningTokenIndexer(),
            "words": SingleIdTokenIndexer("words"),
            "characters": TokenCharactersIndexer("characters", min_padding_length=1),
        },
    )
    field.index(self.vocab)
    padding_lengths = field.get_padding_lengths()
    assert padding_lengths == {
        "field_with_dict___token_ids": 5,
        "field_with_dict___additional_key": 2,
        "words___tokens": 2,
        "characters___token_characters": 2,
        "characters___num_token_characters": 8,
    }
    padding_lengths["field_with_dict___token_ids"] = 7
    padding_lengths["field_with_dict___additional_key"] = 3
    padding_lengths["words___tokens"] = 4
    padding_lengths["characters___token_characters"] = 4
    tensors = field.as_tensor(padding_lengths)
    assert list(tensors["field_with_dict"]["token_ids"].shape) == [7]
    assert list(tensors["field_with_dict"]["additional_key"].shape) == [3]
    assert list(tensors["words"]["tokens"].shape) == [4]
    assert list(tensors["characters"]["token_characters"].shape) == [4, 8]