Python allennlp.common.util.pad_sequence_to_length() Examples

The following are 30 code examples of allennlp.common.util.pad_sequence_to_length(), drawn from open-source projects. Each example lists its original project and source file. You may also want to check out the other available functions and classes of the allennlp.common.util module.
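For orientation, here is a minimal sketch of what the function itself does, based on the behavior exercised in Example #10 below (the keyword names default_value and padding_on_right come from that test; the truncation case follows the docstring in Example #1):

from allennlp.common.util import pad_sequence_to_length

# Pad on the right with zeros by default.
pad_sequence_to_length([1, 2, 3], 5)                             # [1, 2, 3, 0, 0]

# `default_value` is a zero-argument callable that produces each padding element.
pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2)    # [1, 2, 3, 2, 2]

# Pad on the left instead of the right.
pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False)     # [0, 0, 1, 2, 3]

# Sequences longer than the desired length are truncated.
pad_sequence_to_length([1, 2, 3, 4, 5, 6], 4)                    # [1, 2, 3, 4]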
Example #1
Source File: token_indexer.py    From allennlp with Apache License 2.0
def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        """
        This method pads a list of tokens given the input padding lengths (which could actually
        truncate things, depending on settings) and returns that padded list of input tokens as a
        `Dict[str, torch.Tensor]`.  This is a dictionary because there should be one key per
        argument that the `TokenEmbedder` corresponding to this class expects in its `forward()`
        method (where the argument name in the `TokenEmbedder` needs to match the key in this
        dictionary).

        The base class implements the case when all you want to do is create a padded `LongTensor`
        for every list in the `tokens` dictionary.  If your `TokenIndexer` needs more complex
        logic than that, you need to override this method.
        """
        tensor_dict = {}
        for key, val in tokens.items():
            if val and isinstance(val[0], bool):
                tensor = torch.BoolTensor(
                    pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: False)
                )
            else:
                tensor = torch.LongTensor(pad_sequence_to_length(val, padding_lengths[key]))
            tensor_dict[key] = tensor
        return tensor_dict 
Example #2
Source File: knowledge_graph_field.py    From allennlp-semparse with Apache License 2.0
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        text_tensors = self._entity_text_field.as_tensor(padding_lengths)
        padded_linking_features = util.pad_sequence_to_length(
            self.linking_features, padding_lengths["num_entities"], default_value=lambda: []
        )
        padded_linking_arrays = []

        def default_feature_value():
            return [0.0] * len(self._feature_extractors)

        for linking_features in padded_linking_features:
            padded_features = util.pad_sequence_to_length(
                linking_features,
                padding_lengths["num_utterance_tokens"],
                default_value=default_feature_value,
            )
            padded_linking_arrays.append(padded_features)
        linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
        return {"text": text_tensors, "linking": linking_features_tensor} 
Example #3
Source File: list_field.py    From magnitude with MIT License
def as_tensor(self,
                  padding_lengths: Dict[str, int],
                  cuda_device: int = -1) -> DataArray:
        padded_field_list = pad_sequence_to_length(self.field_list,
                                                   padding_lengths[u'num_fields'],
                                                   self.field_list[0].empty_field)
        # Here we're removing the scoping on the padding length keys that we added in
        # `get_padding_lengths`; see the note there for more detail.
        child_padding_lengths = dict((key.replace(u'list_', u'', 1), value)
                                 for key, value in list(padding_lengths.items())
                                 if key.startswith(u'list_'))
        padded_fields = [field.as_tensor(child_padding_lengths, cuda_device)
                         for field in padded_field_list]
        return self.field_list[0].batch_tensors(padded_fields)

Example #4
Source File: knowledge_graph_field.py    From stog with MIT License
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tensors = {}
        desired_num_entities = padding_lengths['num_entities']
        desired_num_entity_tokens = padding_lengths['num_entity_tokens']
        desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
        for indexer_name, indexer in self._token_indexers.items():
            padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                          desired_num_entities,
                                                          default_value=lambda: [])
            padded_arrays = []
            for padded_entity in padded_entities:
                padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                          {'key': desired_num_entity_tokens},
                                                          padding_lengths)['key']
                padded_arrays.append(padded_array)
            tensor = torch.LongTensor(padded_arrays)
            tensors[indexer_name] = tensor
        padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                              desired_num_entities,
                                                              default_value=lambda: [])
        padded_linking_arrays = []
        default_feature_value = lambda: [0.0] * len(self._feature_extractors)
        for linking_features in padded_linking_features:
            padded_features = util.pad_sequence_to_length(linking_features,
                                                          desired_num_utterance_tokens,
                                                          default_value=default_feature_value)
            padded_linking_arrays.append(padded_features)
        linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
        return {'text': tensors, 'linking': linking_features_tensor} 
Example #5
Source File: byte_pair_indexer.py    From DISTRE with Apache License 2.0
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return {key: pad_sequence_to_length(val, desired_num_tokens[key])
                for key, val in tokens.items()} 
Example #6
Source File: tiny_single_id.py    From optuna with MIT License
def as_padded_tensor(
        self,
        tokens: Dict[str, List[int]],
        desired_num_tokens: Dict[str, int],
        padding_lengths: Dict[str, int],
    ) -> Dict[str, torch.Tensor]:
        return {
            key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
            for key, val in tokens.items()
        } 
Example #7
Source File: fasttext_token_indexer.py    From sigir19-neural-ir with Apache License 2.0
def pad_token_sequence(self,
                           tokens: Dict[str, List[List[int]]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:
        # pylint: disable=unused-argument
        return {key: torch.stack(pad_sequence_to_length(val, desired_num_tokens[key],
                                                        default_value=self._default_value_for_padding)).long()
                for key, val in tokens.items()} 
Example #8
Source File: knowledge_graph_field.py    From gtos with MIT License
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tensors = {}
        desired_num_entities = padding_lengths['num_entities']
        desired_num_entity_tokens = padding_lengths['num_entity_tokens']
        desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
        for indexer_name, indexer in self._token_indexers.items():
            padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                          desired_num_entities,
                                                          default_value=lambda: [])
            padded_arrays = []
            for padded_entity in padded_entities:
                padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                          {'key': desired_num_entity_tokens},
                                                          padding_lengths)['key']
                padded_arrays.append(padded_array)
            tensor = torch.LongTensor(padded_arrays)
            tensors[indexer_name] = tensor
        padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                              desired_num_entities,
                                                              default_value=lambda: [])
        padded_linking_arrays = []
        default_feature_value = lambda: [0.0] * len(self._feature_extractors)
        for linking_features in padded_linking_features:
            padded_features = util.pad_sequence_to_length(linking_features,
                                                          desired_num_utterance_tokens,
                                                          default_value=default_feature_value)
            padded_linking_arrays.append(padded_features)
        linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
        return {'text': tensors, 'linking': linking_features_tensor} 
Example #9
Source File: copy_map_field.py    From nlp-models with MIT License
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
        desired_length = padding_lengths["num_tokens"]
        padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
        tensor = torch.LongTensor(padded_tokens)
        return tensor 
Example #10
Source File: test_util.py    From magnitude with MIT License
def test_pad_sequence_to_length(self):
        assert util.pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0]
        assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [1, 2, 3, 2, 2]
        assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [0, 0, 1, 2, 3] 
Example #11
Source File: text_field_test.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key])) for key, val in list(tokens.items())) 
Example #12
Source File: sequence_label_field.py    From magnitude with MIT License
def as_tensor(self,
                  padding_lengths: Dict[str, int],
                  cuda_device: int = -1) -> torch.Tensor:
        desired_num_tokens = padding_lengths[u'num_tokens']
        padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
        tensor = torch.LongTensor(padded_tags)
        return tensor if cuda_device == -1 else tensor.cuda(cuda_device)

Example #13
Source File: knowledge_graph_field.py    From magnitude with MIT License
def as_tensor(self,
                  padding_lengths: Dict[str, int],
                  cuda_device: int = -1) -> Dict[str, torch.Tensor]:
        tensors = {}
        desired_num_entities = padding_lengths[u'num_entities']
        desired_num_entity_tokens = padding_lengths[u'num_entity_tokens']
        desired_num_utterance_tokens = padding_lengths[u'num_utterance_tokens']
        for indexer_name, indexer in list(self._token_indexers.items()):
            padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                          desired_num_entities,
                                                          default_value=lambda: [])
            padded_arrays = []
            for padded_entity in padded_entities:
                padded_array = indexer.pad_token_sequence({u'key': padded_entity},
                                                          {u'key': desired_num_entity_tokens},
                                                          padding_lengths)[u'key']
                padded_arrays.append(padded_array)
            tensor = torch.LongTensor(padded_arrays)
            tensors[indexer_name] = tensor if cuda_device == -1 else tensor.cuda(cuda_device)
        padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                              desired_num_entities,
                                                              default_value=lambda: [])
        padded_linking_arrays = []
        default_feature_value = lambda: [0.0] * len(self._feature_extractors)
        for linking_features in padded_linking_features:
            padded_features = util.pad_sequence_to_length(linking_features,
                                                          desired_num_utterance_tokens,
                                                          default_value=default_feature_value)
            padded_linking_arrays.append(padded_features)
        linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
        if cuda_device != -1:
            linking_features_tensor = linking_features_tensor.cuda(cuda_device)
        return {u'text': tensors, u'linking': linking_features_tensor} 
Example #14
Source File: token_characters_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[List[int]]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:
        # Pad the tokens.
        # tokens has only one key...
        key = list(tokens.keys())[0]

        padded_tokens = pad_sequence_to_length(
                tokens[key], desired_num_tokens[key],
                default_value=self.get_padding_token
        )

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths[u'num_token_characters']
        longest_token: List[int] = max(tokens[key], key=len, default=[])
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # pad the list of lists to the longest sublist, appending 0's
        padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Removes the "dummy token".
            padded_tokens.pop()
        # Truncate all the tokens to the desired length and return the result.
        return {key: [list(token[:desired_token_length]) for token in padded_tokens]} 
Example #15
Source File: features_field.py    From scitail with Apache License 2.0
def as_array(self, padding_lengths: Dict[str, int]) -> numpy.array:
        padded_features = pad_sequence_to_length(self.features,
                                                 padding_lengths['num_features'],
                                                 (lambda: math.nan))
        return numpy.asarray(padded_features, dtype=numpy.float32) 
Example #16
Source File: wordpiece_indexer.py    From NLP_Toolkit with Apache License 2.0
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return {key: pad_sequence_to_length(val, desired_num_tokens[key])
                for key, val in tokens.items()} 
Example #17
Source File: token_characters_indexer.py    From allennlp with Apache License 2.0
def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        # Pad the tokens.
        padded_tokens = pad_sequence_to_length(
            tokens["token_characters"],
            padding_lengths["token_characters"],
            default_value=lambda: [],
        )

        # Pad the characters within the tokens.
        desired_token_length = padding_lengths["num_token_characters"]
        longest_token: List[int] = max(tokens["token_characters"], key=len, default=[])  # type: ignore
        padding_value = 0
        if desired_token_length > len(longest_token):
            # Since we want to pad to greater than the longest token, we add a
            # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
            padded_tokens.append([padding_value] * desired_token_length)
        # pad the list of lists to the longest sublist, appending 0's
        padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
        if desired_token_length > len(longest_token):
            # Removes the "dummy token".
            padded_tokens.pop()
        # Truncate all the tokens to the desired length and return the result.
        return {
            "token_characters": torch.LongTensor(
                [list(token[:desired_token_length]) for token in padded_tokens]
            )
        } 
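Examples #14 and #17 rely on the same two-level padding trick: first pad the outer list of tokens with pad_sequence_to_length, then pad each token's character ids to a common length by transposing with itertools.zip_longest and transposing back. A minimal standalone sketch with made-up character ids (the variable names here are illustrative, not taken from either source file):

import itertools
from allennlp.common.util import pad_sequence_to_length

token_characters = [[3, 7], [5], [2, 9, 4]]  # three tokens with variable character counts

# Pad the outer list to four tokens, using an empty character list as the padding token.
padded_tokens = pad_sequence_to_length(token_characters, 4, default_value=lambda: [])

# zip_longest pads every character list to the longest one with 0s; the outer zip
# transposes the result back into one tuple per token.
padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=0)))

# Finally, truncate each token to the desired number of characters.
desired_token_length = 3
print([list(token[:desired_token_length]) for token in padded_tokens])
# [[3, 7, 0], [5, 0, 0], [2, 9, 4], [0, 0, 0]]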
Example #18
Source File: namespace_swapping_field.py    From allennlp with Apache License 2.0
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
        desired_length = padding_lengths["num_tokens"]
        padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
        tensor = torch.LongTensor(padded_tokens)
        return tensor 
Example #19
Source File: list_field.py    From allennlp with Apache License 2.0
def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray:
        padded_field_list = pad_sequence_to_length(
            self.field_list, padding_lengths["num_fields"], self.field_list[0].empty_field
        )
        # Here we're removing the scoping on the padding length keys that we added in
        # `get_padding_lengths`; see the note there for more detail.
        child_padding_lengths = {
            key.replace("list_", "", 1): value
            for key, value in padding_lengths.items()
            if key.startswith("list_")
        }
        padded_fields = [field.as_tensor(child_padding_lengths) for field in padded_field_list]
        return self.field_list[0].batch_tensors(padded_fields) 
Example #20
Source File: sequence_label_field.py    From allennlp with Apache License 2.0
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
        desired_num_tokens = padding_lengths["num_tokens"]
        padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
        tensor = torch.LongTensor(padded_tags)
        return tensor 
Example #21
Source File: dep_label_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items())) 
Example #22
Source File: bert_pretrained.py    From udify with MIT License
def as_padded_tensor(
            self,
            tokens: Dict[str, List[int]],
            desired_num_tokens: Dict[str, int],
            padding_lengths: Dict[str, int],
    ) -> Dict[str, torch.Tensor]:
        return {
            key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
            for key, val in tokens.items()
        } 
Example #23
Source File: elmo_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[List[int]]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:
        # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                            default_value=self._default_value_for_padding))
                for key, val in list(tokens.items())) 
Example #24
Source File: ner_tag_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items())) 
Example #25
Source File: openai_transformer_byte_pair_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in tokens.items()) 
Example #26
Source File: pos_tag_indexer.py    From magnitude with MIT License
def pad_token_sequence(self,
                           tokens: Dict[str, List[int]],
                           desired_num_tokens: Dict[str, int],
                           padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
        return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items())) 
Example #27
Source File: wikitables_semantic_parser.py    From magnitude with MIT License
def _get_type_vector(worlds: List[WikiTablesWorld],
                         num_entities: int,
                         tensor: torch.Tensor) -> Tuple[torch.LongTensor, Dict[int, int]]:
        u"""
        Produces the one hot encoding for each entity's type. In addition,
        a map from a flattened entity index to type is returned to combine
        entity type operations into one method.

        Parameters
        ----------
        worlds : ``List[WikiTablesWorld]``
        num_entities : ``int``
        tensor : ``torch.Tensor``
            Used for copying the constructed list onto the right device.

        Returns
        -------
        A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
        entity_types : ``Dict[int, int]``
            This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
        """
        entity_types = {}
        batch_types = []
        for batch_index, world in enumerate(worlds):
            types = []
            for entity_index, entity in enumerate(world.table_graph.entities):
                one_hot_vectors = [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
                # We need numbers to be first, then cells, then parts, then row, because our
                # entities are going to be sorted.  We do a split by type and then a merge later,
                # and it relies on this sorting.
                if entity.startswith(u'fb:cell'):
                    entity_type = 1
                elif entity.startswith(u'fb:part'):
                    entity_type = 2
                elif entity.startswith(u'fb:row'):
                    entity_type = 3
                else:
                    entity_type = 0
                types.append(one_hot_vectors[entity_type])

                # For easier lookups later, we're actually using a _flattened_ version
                # of (batch_index, entity_index) for the key, because this is how the
                # linking scores are stored.
                flattened_entity_index = batch_index * num_entities + entity_index
                entity_types[flattened_entity_index] = entity_type
            padded = pad_sequence_to_length(types, num_entities, lambda: [0, 0, 0, 0])
            batch_types.append(padded)
        return tensor.new_tensor(batch_types), entity_types 
Example #28
Source File: atis_semantic_parser.py    From allennlp-semparse with Apache License 2.0
def _get_type_vector(
        worlds: List[AtisWorld], num_entities: int, tensor: torch.Tensor = None
    ) -> Tuple[torch.LongTensor, Dict[int, int]]:
        """
        Produces the encoding for each entity's type. In addition, a map from a flattened entity
        index to type is returned to combine entity type operations into one method.

        Parameters
        ----------
        worlds : ``List[AtisWorld]``
        num_entities : ``int``
        tensor : ``torch.Tensor``
            Used for copying the constructed list onto the right device.

        Returns
        -------
        A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
        entity_types : ``Dict[int, int]``
            This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
        """
        entity_types = {}
        batch_types = []

        for batch_index, world in enumerate(worlds):
            types = []
            entities = [
                ("number", entity)
                if any(
                    [
                        entity.startswith(numeric_nonterminal)
                        for numeric_nonterminal in NUMERIC_NONTERMINALS
                    ]
                )
                else ("string", entity)
                for entity in world.entities
            ]

            for entity_index, entity in enumerate(entities):
                # We need numbers to be first, then strings, since our entities are going to be
                # sorted. We do a split by type and then a merge later, and it relies on this sorting.
                if entity[0] == "number":
                    entity_type = 1
                else:
                    entity_type = 0
                types.append(entity_type)

                # For easier lookups later, we're actually using a _flattened_ version
                # of (batch_index, entity_index) for the key, because this is how the
                # linking scores are stored.
                flattened_entity_index = batch_index * num_entities + entity_index
                entity_types[flattened_entity_index] = entity_type
            padded = pad_sequence_to_length(types, num_entities, lambda: 0)
            batch_types.append(padded)

        return tensor.new_tensor(batch_types, dtype=torch.long), entity_types 
Example #29
Source File: wikitables_semantic_parser.py    From allennlp-semparse with Apache License 2.0
def _get_neighbor_indices(
        worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
    ) -> torch.LongTensor:
        """
        This method returns the indices of each entity's neighbors. A tensor
        is accepted as a parameter for copying purposes.

        Parameters
        ----------
        worlds : ``List[WikiTablesLanguage]``
        num_entities : ``int``
        tensor : ``torch.Tensor``
            Used for copying the constructed list onto the right device.

        Returns
        -------
        A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_neighbors)``. It is padded
        with -1 instead of 0, since 0 is a valid neighbor index. If all the entities in the batch
        have no neighbors, None will be returned.
        """

        num_neighbors = 0
        for world in worlds:
            for entity in world.table_graph.entities:
                if len(world.table_graph.neighbors[entity]) > num_neighbors:
                    num_neighbors = len(world.table_graph.neighbors[entity])

        batch_neighbors = []
        no_entities_have_neighbors = True
        for world in worlds:
            # Each batch instance has its own world, which has a corresponding table.
            entities = world.table_graph.entities
            entity2index = {entity: i for i, entity in enumerate(entities)}
            entity2neighbors = world.table_graph.neighbors
            neighbor_indexes = []
            for entity in entities:
                entity_neighbors = [entity2index[n] for n in entity2neighbors[entity]]
                if entity_neighbors:
                    no_entities_have_neighbors = False
                # Pad with -1 instead of 0, since 0 represents a neighbor index.
                padded = pad_sequence_to_length(entity_neighbors, num_neighbors, lambda: -1)
                neighbor_indexes.append(padded)
            neighbor_indexes = pad_sequence_to_length(
                neighbor_indexes, num_entities, lambda: [-1] * num_neighbors
            )
            batch_neighbors.append(neighbor_indexes)
        # It is possible that none of the entities has any neighbors, since our definition of the
        # knowledge graph allows it when no entities or numbers were extracted from the question.
        if no_entities_have_neighbors:
            return None
        return tensor.new_tensor(batch_neighbors, dtype=torch.long) 
Example #30
Source File: wikitables_semantic_parser.py    From allennlp-semparse with Apache License 2.0
def _get_type_vector(
        worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
    ) -> Tuple[torch.LongTensor, Dict[int, int]]:
        """
        Produces a tensor with shape ``(batch_size, num_entities)`` that encodes each entity's
        type. In addition, a map from a flattened entity index to type is returned to combine
        entity type operations into one method.

        Parameters
        ----------
        worlds : ``List[WikiTablesLanguage]``
        num_entities : ``int``
        tensor : ``torch.Tensor``
            Used for copying the constructed list onto the right device.

        Returns
        -------
        A ``torch.LongTensor`` with shape ``(batch_size, num_entities)``.
        entity_types : ``Dict[int, int]``
            This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
        """
        entity_types = {}
        batch_types = []
        for batch_index, world in enumerate(worlds):
            types = []
            for entity_index, entity in enumerate(world.table_graph.entities):
                # We need numbers to be first, then date columns, then number columns, strings, and
                # string columns, in that order, because our entities are going to be sorted.  We do
                # a split by type and then a merge later, and it relies on this sorting.
                if entity.startswith("date_column:"):
                    entity_type = 1
                elif entity.startswith("number_column:"):
                    entity_type = 2
                elif entity.startswith("string:"):
                    entity_type = 3
                elif entity.startswith("string_column:"):
                    entity_type = 4
                else:
                    entity_type = 0
                types.append(entity_type)

                # For easier lookups later, we're actually using a _flattened_ version
                # of (batch_index, entity_index) for the key, because this is how the
                # linking scores are stored.
                flattened_entity_index = batch_index * num_entities + entity_index
                entity_types[flattened_entity_index] = entity_type
            padded = pad_sequence_to_length(types, num_entities, lambda: 0)
            batch_types.append(padded)
        return tensor.new_tensor(batch_types, dtype=torch.long), entity_types