Python allennlp.common.util.pad_sequence_to_length() Examples
The following are 30 code examples of allennlp.common.util.pad_sequence_to_length(), drawn from open-source projects that use AllenNLP.
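Before walking through the project examples, it may help to see the function's basic behavior in isolation. The snippet below is a minimal sketch that assumes AllenNLP's usual signature, pad_sequence_to_length(sequence, desired_length, default_value=lambda: 0, padding_on_right=True); the same behavior is exercised by the test case in Example #10.

from allennlp.common.util import pad_sequence_to_length

# Pad on the right (the default) using the default padding value, 0.
assert pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0]

# Use a custom default_value callable and pad on the left instead.
assert pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 9,
                              padding_on_right=False) == [9, 9, 1, 2, 3]

# A sequence longer than the desired length is truncated rather than padded.
assert pad_sequence_to_length([1, 2, 3, 4, 5], 3) == [1, 2, 3]

Note that default_value is a zero-argument callable rather than a literal value, which is why the examples below pass things like default_value=lambda: [] when padding lists of lists.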
Example #1
Source File: token_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    """
    This method pads a list of tokens given the input padding lengths (which could actually
    truncate things, depending on settings) and returns that padded list of input tokens as a
    `Dict[str, torch.Tensor]`.  This is a dictionary because there should be one key per
    argument that the `TokenEmbedder` corresponding to this class expects in its `forward()`
    method (where the argument name in the `TokenEmbedder` needs to make the key in this
    dictionary).

    The base class implements the case when all you want to do is create a padded `LongTensor`
    for every list in the `tokens` dictionary.  If your `TokenIndexer` needs more complex
    logic than that, you need to override this method.
    """
    tensor_dict = {}
    for key, val in tokens.items():
        if val and isinstance(val[0], bool):
            tensor = torch.BoolTensor(
                pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: False)
            )
        else:
            tensor = torch.LongTensor(pad_sequence_to_length(val, padding_lengths[key]))
        tensor_dict[key] = tensor
    return tensor_dict
Example #2
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    text_tensors = self._entity_text_field.as_tensor(padding_lengths)
    padded_linking_features = util.pad_sequence_to_length(
        self.linking_features, padding_lengths["num_entities"], default_value=lambda: []
    )
    padded_linking_arrays = []

    def default_feature_value():
        return [0.0] * len(self._feature_extractors)

    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(
            linking_features,
            padding_lengths["num_utterance_tokens"],
            default_value=default_feature_value,
        )
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {"text": text_tensors, "linking": linking_features_tensor}
Example #3
Source File: list_field.py From magnitude with MIT License | 6 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    padded_field_list = pad_sequence_to_length(self.field_list,
                                               padding_lengths[u'num_fields'],
                                               self.field_list[0].empty_field)
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = dict((key.replace(u'list_', u'', 1), value)
                                 for key, value in list(padding_lengths.items())
                                 if key.startswith(u'list_'))
    padded_fields = [field.as_tensor(child_padding_lengths, cuda_device)
                     for field in padded_field_list]
    return self.field_list[0].batch_tensors(padded_fields)
Example #4
Source File: knowledge_graph_field.py From stog with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    tensors = {}
    desired_num_entities = padding_lengths['num_entities']
    desired_num_entity_tokens = padding_lengths['num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
    for indexer_name, indexer in self._token_indexers.items():
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                      {'key': desired_num_entity_tokens},
                                                      padding_lengths)['key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {'text': tensors, 'linking': linking_features_tensor}
Example #5
Source File: byte_pair_indexer.py From DISTRE with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[int]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()}
Example #6
Source File: tiny_single_id.py From optuna with MIT License | 5 votes |
def as_padded_tensor(
    self,
    tokens: Dict[str, List[int]],
    desired_num_tokens: Dict[str, int],
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    return {
        key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
        for key, val in tokens.items()
    }
Example #7
Source File: fasttext_token_indexer.py From sigir19-neural-ir with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[List[int]]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:  # pylint: disable=unused-argument
    return {key: torch.stack(pad_sequence_to_length(val, desired_num_tokens[key],
                                                    default_value=self._default_value_for_padding)).long()
            for key, val in tokens.items()}
Example #8
Source File: knowledge_graph_field.py From gtos with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    tensors = {}
    desired_num_entities = padding_lengths['num_entities']
    desired_num_entity_tokens = padding_lengths['num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
    for indexer_name, indexer in self._token_indexers.items():
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                      {'key': desired_num_entity_tokens},
                                                      padding_lengths)['key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {'text': tensors, 'linking': linking_features_tensor}
Example #9
Source File: copy_map_field.py From nlp-models with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_length = padding_lengths["num_tokens"]
    padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
    tensor = torch.LongTensor(padded_tokens)
    return tensor
Example #10
Source File: test_util.py From magnitude with MIT License | 5 votes |
def test_pad_sequence_to_length(self):
    assert util.pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0]
    assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [1, 2, 3, 2, 2]
    assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [0, 0, 1, 2, 3]
Example #11
Source File: text_field_test.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #12
Source File: sequence_label_field.py From magnitude with MIT License | 5 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    desired_num_tokens = padding_lengths[u'num_tokens']
    padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor if cuda_device == -1 else tensor.cuda(cuda_device)
Example #13
Source File: knowledge_graph_field.py From magnitude with MIT License | 5 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    tensors = {}
    desired_num_entities = padding_lengths[u'num_entities']
    desired_num_entity_tokens = padding_lengths[u'num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths[u'num_utterance_tokens']
    for indexer_name, indexer in list(self._token_indexers.items()):
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({u'key': padded_entity},
                                                      {u'key': desired_num_entity_tokens},
                                                      padding_lengths)[u'key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor if cuda_device == -1 else tensor.cuda(cuda_device)
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    if cuda_device != -1:
        linking_features_tensor = linking_features_tensor.cuda(cuda_device)
    return {u'text': tensors, u'linking': linking_features_tensor}
Example #14
Source File: token_characters_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):
    # Pad the tokens.
    # tokens has only one key...
    key = list(tokens.keys())[0]

    padded_tokens = pad_sequence_to_length(
        tokens[key], desired_num_tokens[key], default_value=self.get_padding_token
    )

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths[u'num_token_characters']
    longest_token = max(tokens[key], key=len, default=[])
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)
    # pad the list of lists to the longest sublist, appending 0's
    padded_tokens = list(izip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
    if desired_token_length > len(longest_token):
        # Removes the "dummy token".
        padded_tokens.pop()
    # Truncates all the tokens to the desired length, and return the result.
    return {key: [list(token[:desired_token_length]) for token in padded_tokens]}
Example #15
Source File: features_field.py From scitail with Apache License 2.0 | 5 votes |
def as_array(self, padding_lengths: Dict[str, int]) -> numpy.array:
    padded_features = pad_sequence_to_length(self.features,
                                             padding_lengths['num_features'],
                                             (lambda: math.nan))
    return numpy.asarray(padded_features, dtype=numpy.float32)
Example #16
Source File: wordpiece_indexer.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[int]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()}
Example #17
Source File: token_characters_indexer.py From allennlp with Apache License 2.0 | 5 votes |
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Pad the tokens.
    padded_tokens = pad_sequence_to_length(
        tokens["token_characters"],
        padding_lengths["token_characters"],
        default_value=lambda: [],
    )

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths["num_token_characters"]
    longest_token: List[int] = max(tokens["token_characters"], key=len, default=[])  # type: ignore
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)

    # pad the list of lists to the longest sublist, appending 0's
    padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))

    if desired_token_length > len(longest_token):
        # Removes the "dummy token".
        padded_tokens.pop()

    # Truncates all the tokens to the desired length, and return the result.
    return {
        "token_characters": torch.LongTensor(
            [list(token[:desired_token_length]) for token in padded_tokens]
        )
    }
Example #18
Source File: namespace_swapping_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_length = padding_lengths["num_tokens"]
    padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
    tensor = torch.LongTensor(padded_tokens)
    return tensor
Example #19
Source File: list_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray:
    padded_field_list = pad_sequence_to_length(
        self.field_list, padding_lengths["num_fields"], self.field_list[0].empty_field
    )
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = {
        key.replace("list_", "", 1): value
        for key, value in padding_lengths.items()
        if key.startswith("list_")
    }
    padded_fields = [field.as_tensor(child_padding_lengths) for field in padded_field_list]
    return self.field_list[0].batch_tensors(padded_fields)
Example #20
Source File: sequence_label_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_num_tokens = padding_lengths["num_tokens"]
    padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor
Example #21
Source File: dep_label_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #22
Source File: bert_pretrained.py From udify with MIT License | 5 votes |
def as_padded_tensor(
    self,
    tokens: Dict[str, List[int]],
    desired_num_tokens: Dict[str, int],
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    return {
        key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
        for key, val in tokens.items()
    }
Example #23
Source File: elmo_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                             default_value=self._default_value_for_padding))
                for key, val in list(tokens.items()))
Example #24
Source File: ner_tag_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #25
Source File: openai_transformer_byte_pair_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in tokens.items())
Example #26
Source File: pos_tag_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #27
Source File: wikitables_semantic_parser.py From magnitude with MIT License | 4 votes |
def _get_type_vector(worlds, num_entities, tensor):
    u"""
    Produces the one hot encoding for each entity's type. In addition,
    a map from a flattened entity index to type is returned to combine
    entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[WikiTablesWorld]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []
    for batch_index, world in enumerate(worlds):
        types = []
        for entity_index, entity in enumerate(world.table_graph.entities):
            one_hot_vectors = [[1, 0, 0, 0],
                               [0, 1, 0, 0],
                               [0, 0, 1, 0],
                               [0, 0, 0, 1]]
            # We need numbers to be first, then cells, then parts, then row, because our
            # entities are going to be sorted.  We do a split by type and then a merge later,
            # and it relies on this sorting.
            if entity.startswith(u'fb:cell'):
                entity_type = 1
            elif entity.startswith(u'fb:part'):
                entity_type = 2
            elif entity.startswith(u'fb:row'):
                entity_type = 3
            else:
                entity_type = 0
            types.append(one_hot_vectors[entity_type])

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: [0, 0, 0, 0])
        batch_types.append(padded)
    return tensor.new_tensor(batch_types), entity_types
Example #28
Source File: atis_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_type_vector(
    worlds: List[AtisWorld], num_entities: int, tensor: torch.Tensor = None
) -> Tuple[torch.LongTensor, Dict[int, int]]:
    """
    Produces the encoding for each entity's type. In addition, a map from a flattened entity
    index to type is returned to combine entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[AtisWorld]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []

    for batch_index, world in enumerate(worlds):
        types = []
        entities = [
            ("number", entity)
            if any(
                [
                    entity.startswith(numeric_nonterminal)
                    for numeric_nonterminal in NUMERIC_NONTERMINALS
                ]
            )
            else ("string", entity)
            for entity in world.entities
        ]

        for entity_index, entity in enumerate(entities):
            # We need numbers to be first, then strings, since our entities are going to be
            # sorted.  We do a split by type and then a merge later, and it relies on this sorting.
            if entity[0] == "number":
                entity_type = 1
            else:
                entity_type = 0
            types.append(entity_type)

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: 0)
        batch_types.append(padded)
    return tensor.new_tensor(batch_types, dtype=torch.long), entity_types
Example #29
Source File: wikitables_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_neighbor_indices(
    worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
) -> torch.LongTensor:
    """
    This method returns the indices of each entity's neighbors. A tensor
    is accepted as a parameter for copying purposes.

    Parameters
    ----------
    worlds : ``List[WikiTablesLanguage]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_neighbors)``. It is padded
    with -1 instead of 0, since 0 is a valid neighbor index. If all the entities in the batch
    have no neighbors, None will be returned.
    """
    num_neighbors = 0
    for world in worlds:
        for entity in world.table_graph.entities:
            if len(world.table_graph.neighbors[entity]) > num_neighbors:
                num_neighbors = len(world.table_graph.neighbors[entity])

    batch_neighbors = []
    no_entities_have_neighbors = True
    for world in worlds:
        # Each batch instance has its own world, which has a corresponding table.
        entities = world.table_graph.entities
        entity2index = {entity: i for i, entity in enumerate(entities)}
        entity2neighbors = world.table_graph.neighbors
        neighbor_indexes = []
        for entity in entities:
            entity_neighbors = [entity2index[n] for n in entity2neighbors[entity]]
            if entity_neighbors:
                no_entities_have_neighbors = False
            # Pad with -1 instead of 0, since 0 represents a neighbor index.
            padded = pad_sequence_to_length(entity_neighbors, num_neighbors, lambda: -1)
            neighbor_indexes.append(padded)
        neighbor_indexes = pad_sequence_to_length(
            neighbor_indexes, num_entities, lambda: [-1] * num_neighbors
        )
        batch_neighbors.append(neighbor_indexes)
    # It is possible that none of the entities has any neighbors, since our definition of the
    # knowledge graph allows it when no entities or numbers were extracted from the question.
    if no_entities_have_neighbors:
        return None
    return tensor.new_tensor(batch_neighbors, dtype=torch.long)
Example #30
Source File: wikitables_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_type_vector(
    worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
) -> Tuple[torch.LongTensor, Dict[int, int]]:
    """
    Produces a tensor with shape ``(batch_size, num_entities)`` that encodes each entity's
    type. In addition, a map from a flattened entity index to type is returned to combine
    entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[WikiTablesLanguage]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []
    for batch_index, world in enumerate(worlds):
        types = []
        for entity_index, entity in enumerate(world.table_graph.entities):
            # We need numbers to be first, then date columns, then number columns, strings, and
            # string columns, in that order, because our entities are going to be sorted.  We do
            # a split by type and then a merge later, and it relies on this sorting.
            if entity.startswith("date_column:"):
                entity_type = 1
            elif entity.startswith("number_column:"):
                entity_type = 2
            elif entity.startswith("string:"):
                entity_type = 3
            elif entity.startswith("string_column:"):
                entity_type = 4
            else:
                entity_type = 0
            types.append(entity_type)

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: 0)
        batch_types.append(padded)
    return tensor.new_tensor(batch_types, dtype=torch.long), entity_types