Python allennlp.common.util.pad_sequence_to_length() Examples
The following are 30 code examples of allennlp.common.util.pad_sequence_to_length(), drawn from open-source projects that use AllenNLP.
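Before walking through the project examples, it may help to see the function's basic behavior in isolation. The snippet below is a minimal sketch that assumes AllenNLP's usual signature, pad_sequence_to_length(sequence, desired_length, default_value=lambda: 0, padding_on_right=True); the same behavior is exercised by the test case in Example #10.

from allennlp.common.util import pad_sequence_to_length

# Pad on the right (the default) using the default padding value, 0.
assert pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0]

# Use a custom default_value callable and pad on the left instead.
assert pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 9,
                              padding_on_right=False) == [9, 9, 1, 2, 3]

# A sequence longer than the desired length is truncated rather than padded.
assert pad_sequence_to_length([1, 2, 3, 4, 5], 3) == [1, 2, 3]

Note that default_value is a zero-argument callable rather than a literal value, which is why the examples below pass things like default_value=lambda: [] when padding lists of lists.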
Example #1
Source File: token_indexer.py From allennlp with Apache License 2.0 | 6 votes |
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    """
    This method pads a list of tokens given the input padding lengths (which could actually
    truncate things, depending on settings) and returns that padded list of input tokens as a
    `Dict[str, torch.Tensor]`.  This is a dictionary because there should be one key per
    argument that the `TokenEmbedder` corresponding to this class expects in its `forward()`
    method (where the argument name in the `TokenEmbedder` needs to make the key in this
    dictionary).

    The base class implements the case when all you want to do is create a padded `LongTensor`
    for every list in the `tokens` dictionary.  If your `TokenIndexer` needs more complex
    logic than that, you need to override this method.
    """
    tensor_dict = {}
    for key, val in tokens.items():
        if val and isinstance(val[0], bool):
            tensor = torch.BoolTensor(
                pad_sequence_to_length(val, padding_lengths[key], default_value=lambda: False)
            )
        else:
            tensor = torch.LongTensor(pad_sequence_to_length(val, padding_lengths[key]))
        tensor_dict[key] = tensor
    return tensor_dict
Example #2
Source File: knowledge_graph_field.py From allennlp-semparse with Apache License 2.0 | 6 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    text_tensors = self._entity_text_field.as_tensor(padding_lengths)
    padded_linking_features = util.pad_sequence_to_length(
        self.linking_features, padding_lengths["num_entities"], default_value=lambda: []
    )
    padded_linking_arrays = []

    def default_feature_value():
        return [0.0] * len(self._feature_extractors)

    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(
            linking_features,
            padding_lengths["num_utterance_tokens"],
            default_value=default_feature_value,
        )
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {"text": text_tensors, "linking": linking_features_tensor}
Example #3
Source File: list_field.py From magnitude with MIT License | 6 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    padded_field_list = pad_sequence_to_length(self.field_list,
                                               padding_lengths[u'num_fields'],
                                               self.field_list[0].empty_field)
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = dict((key.replace(u'list_', u'', 1), value)
                                 for key, value in list(padding_lengths.items())
                                 if key.startswith(u'list_'))
    padded_fields = [field.as_tensor(child_padding_lengths, cuda_device)
                     for field in padded_field_list]
    return self.field_list[0].batch_tensors(padded_fields)
Example #4
Source File: knowledge_graph_field.py From stog with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    tensors = {}
    desired_num_entities = padding_lengths['num_entities']
    desired_num_entity_tokens = padding_lengths['num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
    for indexer_name, indexer in self._token_indexers.items():
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                      {'key': desired_num_entity_tokens},
                                                      padding_lengths)['key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {'text': tensors, 'linking': linking_features_tensor}
Example #5
Source File: byte_pair_indexer.py From DISTRE with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[int]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()}
Example #6
Source File: tiny_single_id.py From optuna with MIT License | 5 votes |
def as_padded_tensor(
    self,
    tokens: Dict[str, List[int]],
    desired_num_tokens: Dict[str, int],
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    return {
        key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
        for key, val in tokens.items()
    }
Example #7
Source File: fasttext_token_indexer.py From sigir19-neural-ir with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[List[int]]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[List[int]]]:  # pylint: disable=unused-argument
    return {key: torch.stack(pad_sequence_to_length(val, desired_num_tokens[key],
                                                    default_value=self._default_value_for_padding)).long()
            for key, val in tokens.items()}
Example #8
Source File: knowledge_graph_field.py From gtos with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
    tensors = {}
    desired_num_entities = padding_lengths['num_entities']
    desired_num_entity_tokens = padding_lengths['num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths['num_utterance_tokens']
    for indexer_name, indexer in self._token_indexers.items():
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({'key': padded_entity},
                                                      {'key': desired_num_entity_tokens},
                                                      padding_lengths)['key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    return {'text': tensors, 'linking': linking_features_tensor}
Example #9
Source File: copy_map_field.py From nlp-models with MIT License | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_length = padding_lengths["num_tokens"]
    padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
    tensor = torch.LongTensor(padded_tokens)
    return tensor
Example #10
Source File: test_util.py From magnitude with MIT License | 5 votes |
def test_pad_sequence_to_length(self):
    assert util.pad_sequence_to_length([1, 2, 3], 5) == [1, 2, 3, 0, 0]
    assert util.pad_sequence_to_length([1, 2, 3], 5, default_value=lambda: 2) == [1, 2, 3, 2, 2]
    assert util.pad_sequence_to_length([1, 2, 3], 5, padding_on_right=False) == [0, 0, 1, 2, 3]
Example #11
Source File: text_field_test.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #12
Source File: sequence_label_field.py From magnitude with MIT License | 5 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    desired_num_tokens = padding_lengths[u'num_tokens']
    padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor if cuda_device == -1 else tensor.cuda(cuda_device)
Example #13
Source File: knowledge_graph_field.py From magnitude with MIT License | 5 votes |
def as_tensor(self, padding_lengths, cuda_device=-1):
    tensors = {}
    desired_num_entities = padding_lengths[u'num_entities']
    desired_num_entity_tokens = padding_lengths[u'num_entity_tokens']
    desired_num_utterance_tokens = padding_lengths[u'num_utterance_tokens']
    for indexer_name, indexer in list(self._token_indexers.items()):
        padded_entities = util.pad_sequence_to_length(self._indexed_entity_texts[indexer_name],
                                                      desired_num_entities,
                                                      default_value=lambda: [])
        padded_arrays = []
        for padded_entity in padded_entities:
            padded_array = indexer.pad_token_sequence({u'key': padded_entity},
                                                      {u'key': desired_num_entity_tokens},
                                                      padding_lengths)[u'key']
            padded_arrays.append(padded_array)
        tensor = torch.LongTensor(padded_arrays)
        tensors[indexer_name] = tensor if cuda_device == -1 else tensor.cuda(cuda_device)
    padded_linking_features = util.pad_sequence_to_length(self.linking_features,
                                                          desired_num_entities,
                                                          default_value=lambda: [])
    padded_linking_arrays = []
    default_feature_value = lambda: [0.0] * len(self._feature_extractors)
    for linking_features in padded_linking_features:
        padded_features = util.pad_sequence_to_length(linking_features,
                                                      desired_num_utterance_tokens,
                                                      default_value=default_feature_value)
        padded_linking_arrays.append(padded_features)
    linking_features_tensor = torch.FloatTensor(padded_linking_arrays)
    if cuda_device != -1:
        linking_features_tensor = linking_features_tensor.cuda(cuda_device)
    return {u'text': tensors, u'linking': linking_features_tensor}
Example #14
Source File: token_characters_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):
    # Pad the tokens.
    # tokens has only one key...
    key = list(tokens.keys())[0]

    padded_tokens = pad_sequence_to_length(
        tokens[key], desired_num_tokens[key], default_value=self.get_padding_token
    )

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths[u'num_token_characters']
    longest_token = max(tokens[key], key=len, default=[])
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)
    # pad the list of lists to the longest sublist, appending 0's
    padded_tokens = list(izip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))
    if desired_token_length > len(longest_token):
        # Removes the "dummy token".
        padded_tokens.pop()
    # Truncates all the tokens to the desired length, and return the result.
    return {key: [list(token[:desired_token_length]) for token in padded_tokens]}
Example #15
Source File: features_field.py From scitail with Apache License 2.0 | 5 votes |
def as_array(self, padding_lengths: Dict[str, int]) -> numpy.array:
    padded_features = pad_sequence_to_length(self.features,
                                             padding_lengths['num_features'],
                                             (lambda: math.nan))
    return numpy.asarray(padded_features, dtype=numpy.float32)
Example #16
Source File: wordpiece_indexer.py From NLP_Toolkit with Apache License 2.0 | 5 votes |
def pad_token_sequence(self,
                       tokens: Dict[str, List[int]],
                       desired_num_tokens: Dict[str, int],
                       padding_lengths: Dict[str, int]) -> Dict[str, List[int]]:  # pylint: disable=unused-argument
    return {key: pad_sequence_to_length(val, desired_num_tokens[key])
            for key, val in tokens.items()}
Example #17
Source File: token_characters_indexer.py From allennlp with Apache License 2.0 | 5 votes |
def as_padded_tensor_dict(
    self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
) -> Dict[str, torch.Tensor]:
    # Pad the tokens.
    padded_tokens = pad_sequence_to_length(
        tokens["token_characters"],
        padding_lengths["token_characters"],
        default_value=lambda: [],
    )

    # Pad the characters within the tokens.
    desired_token_length = padding_lengths["num_token_characters"]
    longest_token: List[int] = max(tokens["token_characters"], key=len, default=[])  # type: ignore
    padding_value = 0
    if desired_token_length > len(longest_token):
        # Since we want to pad to greater than the longest token, we add a
        # "dummy token" so we can take advantage of the fast implementation of itertools.zip_longest.
        padded_tokens.append([padding_value] * desired_token_length)

    # pad the list of lists to the longest sublist, appending 0's
    padded_tokens = list(zip(*itertools.zip_longest(*padded_tokens, fillvalue=padding_value)))

    if desired_token_length > len(longest_token):
        # Removes the "dummy token".
        padded_tokens.pop()

    # Truncates all the tokens to the desired length, and return the result.
    return {
        "token_characters": torch.LongTensor(
            [list(token[:desired_token_length]) for token in padded_tokens]
        )
    }
Example #18
Source File: namespace_swapping_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_length = padding_lengths["num_tokens"]
    padded_tokens = pad_sequence_to_length(self._mapping_array, desired_length)
    tensor = torch.LongTensor(padded_tokens)
    return tensor
Example #19
Source File: list_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> DataArray:
    padded_field_list = pad_sequence_to_length(
        self.field_list, padding_lengths["num_fields"], self.field_list[0].empty_field
    )
    # Here we're removing the scoping on the padding length keys that we added in
    # `get_padding_lengths`; see the note there for more detail.
    child_padding_lengths = {
        key.replace("list_", "", 1): value
        for key, value in padding_lengths.items()
        if key.startswith("list_")
    }
    padded_fields = [field.as_tensor(child_padding_lengths) for field in padded_field_list]
    return self.field_list[0].batch_tensors(padded_fields)
Example #20
Source File: sequence_label_field.py From allennlp with Apache License 2.0 | 5 votes |
def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
    desired_num_tokens = padding_lengths["num_tokens"]
    padded_tags = pad_sequence_to_length(self._indexed_labels, desired_num_tokens)
    tensor = torch.LongTensor(padded_tags)
    return tensor
Example #21
Source File: dep_label_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #22
Source File: bert_pretrained.py From udify with MIT License | 5 votes |
def as_padded_tensor(
    self,
    tokens: Dict[str, List[int]],
    desired_num_tokens: Dict[str, int],
    padding_lengths: Dict[str, int],
) -> Dict[str, torch.Tensor]:
    return {
        key: torch.LongTensor(pad_sequence_to_length(val, desired_num_tokens[key]))
        for key, val in tokens.items()
    }
Example #23
Source File: elmo_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key],
                                             default_value=self._default_value_for_padding))
                for key, val in list(tokens.items()))
Example #24
Source File: ner_tag_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #25
Source File: openai_transformer_byte_pair_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in tokens.items())
Example #26
Source File: pos_tag_indexer.py From magnitude with MIT License | 5 votes |
def pad_token_sequence(self, tokens, desired_num_tokens, padding_lengths):  # pylint: disable=unused-argument
    return dict((key, pad_sequence_to_length(val, desired_num_tokens[key]))
                for key, val in list(tokens.items()))
Example #27
Source File: wikitables_semantic_parser.py From magnitude with MIT License | 4 votes |
def _get_type_vector(worlds, num_entities, tensor):
    u"""
    Produces the one hot encoding for each entity's type. In addition,
    a map from a flattened entity index to type is returned to combine
    entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[WikiTablesWorld]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []
    for batch_index, world in enumerate(worlds):
        types = []
        for entity_index, entity in enumerate(world.table_graph.entities):
            one_hot_vectors = [[1, 0, 0, 0],
                               [0, 1, 0, 0],
                               [0, 0, 1, 0],
                               [0, 0, 0, 1]]
            # We need numbers to be first, then cells, then parts, then row, because our
            # entities are going to be sorted.  We do a split by type and then a merge later,
            # and it relies on this sorting.
            if entity.startswith(u'fb:cell'):
                entity_type = 1
            elif entity.startswith(u'fb:part'):
                entity_type = 2
            elif entity.startswith(u'fb:row'):
                entity_type = 3
            else:
                entity_type = 0
            types.append(one_hot_vectors[entity_type])

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: [0, 0, 0, 0])
        batch_types.append(padded)
    return tensor.new_tensor(batch_types), entity_types
Example #28
Source File: atis_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_type_vector(
    worlds: List[AtisWorld], num_entities: int, tensor: torch.Tensor = None
) -> Tuple[torch.LongTensor, Dict[int, int]]:
    """
    Produces the encoding for each entity's type. In addition, a map from a flattened entity
    index to type is returned to combine entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[AtisWorld]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_types)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []

    for batch_index, world in enumerate(worlds):
        types = []
        entities = [
            ("number", entity)
            if any(
                [
                    entity.startswith(numeric_nonterminal)
                    for numeric_nonterminal in NUMERIC_NONTERMINALS
                ]
            )
            else ("string", entity)
            for entity in world.entities
        ]

        for entity_index, entity in enumerate(entities):
            # We need numbers to be first, then strings, since our entities are going to be
            # sorted.  We do a split by type and then a merge later, and it relies on this sorting.
            if entity[0] == "number":
                entity_type = 1
            else:
                entity_type = 0
            types.append(entity_type)

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: 0)
        batch_types.append(padded)
    return tensor.new_tensor(batch_types, dtype=torch.long), entity_types
Example #29
Source File: wikitables_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_neighbor_indices(
    worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
) -> torch.LongTensor:
    """
    This method returns the indices of each entity's neighbors. A tensor
    is accepted as a parameter for copying purposes.

    Parameters
    ----------
    worlds : ``List[WikiTablesLanguage]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities, num_neighbors)``. It is padded
    with -1 instead of 0, since 0 is a valid neighbor index. If all the entities in the batch
    have no neighbors, None will be returned.
    """
    num_neighbors = 0
    for world in worlds:
        for entity in world.table_graph.entities:
            if len(world.table_graph.neighbors[entity]) > num_neighbors:
                num_neighbors = len(world.table_graph.neighbors[entity])

    batch_neighbors = []
    no_entities_have_neighbors = True
    for world in worlds:
        # Each batch instance has its own world, which has a corresponding table.
        entities = world.table_graph.entities
        entity2index = {entity: i for i, entity in enumerate(entities)}
        entity2neighbors = world.table_graph.neighbors
        neighbor_indexes = []
        for entity in entities:
            entity_neighbors = [entity2index[n] for n in entity2neighbors[entity]]
            if entity_neighbors:
                no_entities_have_neighbors = False
            # Pad with -1 instead of 0, since 0 represents a neighbor index.
            padded = pad_sequence_to_length(entity_neighbors, num_neighbors, lambda: -1)
            neighbor_indexes.append(padded)
        neighbor_indexes = pad_sequence_to_length(
            neighbor_indexes, num_entities, lambda: [-1] * num_neighbors
        )
        batch_neighbors.append(neighbor_indexes)
    # It is possible that none of the entities has any neighbors, since our definition of the
    # knowledge graph allows it when no entities or numbers were extracted from the question.
    if no_entities_have_neighbors:
        return None
    return tensor.new_tensor(batch_neighbors, dtype=torch.long)
Example #30
Source File: wikitables_semantic_parser.py From allennlp-semparse with Apache License 2.0 | 4 votes |
def _get_type_vector(
    worlds: List[WikiTablesLanguage], num_entities: int, tensor: torch.Tensor
) -> Tuple[torch.LongTensor, Dict[int, int]]:
    """
    Produces a tensor with shape ``(batch_size, num_entities)`` that encodes each entity's
    type. In addition, a map from a flattened entity index to type is returned to combine
    entity type operations into one method.

    Parameters
    ----------
    worlds : ``List[WikiTablesLanguage]``
    num_entities : ``int``
    tensor : ``torch.Tensor``
        Used for copying the constructed list onto the right device.

    Returns
    -------
    A ``torch.LongTensor`` with shape ``(batch_size, num_entities)``.
    entity_types : ``Dict[int, int]``
        This is a mapping from ((batch_index * num_entities) + entity_index) to entity type id.
    """
    entity_types = {}
    batch_types = []
    for batch_index, world in enumerate(worlds):
        types = []
        for entity_index, entity in enumerate(world.table_graph.entities):
            # We need numbers to be first, then date columns, then number columns, strings, and
            # string columns, in that order, because our entities are going to be sorted.  We do
            # a split by type and then a merge later, and it relies on this sorting.
            if entity.startswith("date_column:"):
                entity_type = 1
            elif entity.startswith("number_column:"):
                entity_type = 2
            elif entity.startswith("string:"):
                entity_type = 3
            elif entity.startswith("string_column:"):
                entity_type = 4
            else:
                entity_type = 0
            types.append(entity_type)

            # For easier lookups later, we're actually using a _flattened_ version
            # of (batch_index, entity_index) for the key, because this is how the
            # linking scores are stored.
            flattened_entity_index = batch_index * num_entities + entity_index
            entity_types[flattened_entity_index] = entity_type
        padded = pad_sequence_to_length(types, num_entities, lambda: 0)
        batch_types.append(padded)
    return tensor.new_tensor(batch_types, dtype=torch.long), entity_types