Python allennlp.nn.util.remove_sentence_boundaries() Examples
The following are 9 code examples of allennlp.nn.util.remove_sentence_boundaries(), drawn from open-source projects. The project and source file for each example are noted above its code.
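Before the examples, here is a minimal, self-contained sketch of what the function does, based on the tests below: it drops the <S>/</S> boundary positions that the ELMo modules add, shrinking the time dimension by two and returning a matching mask. (The bool mask follows the allennlp examples; the magnitude examples use long masks.)

import numpy
import torch
from allennlp.nn import util

# A padded batch of ELMo-style outputs: 2 sequences, 5 timesteps, 4 dims.
tensor = torch.from_numpy(numpy.random.rand(2, 5, 4))
# Each row covers <S>, the real tokens, </S>; trailing zeros are padding.
mask = torch.from_numpy(numpy.array([[1, 1, 1, 1, 1],
                                     [1, 1, 1, 0, 0]])).bool()

new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)
# The boundary positions are dropped, so the time dimension shrinks by two.
print(new_tensor.shape)  # torch.Size([2, 3, 4])
print(new_mask.shape)    # torch.Size([2, 3])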
Example #1
Source File: util_test.py From allennlp with Apache License 2.0
def test_remove_sentence_boundaries(self):
    tensor = torch.from_numpy(numpy.random.rand(3, 5, 7))
    mask = torch.from_numpy(
        # The mask with two elements is to test the corner case
        # of an empty sequence, so here we are removing boundaries
        # from "<S> </S>"
        numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
    ).bool()
    new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)

    expected_new_tensor = torch.zeros(3, 3, 7)
    expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :]
    expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :]
    assert_array_almost_equal(new_tensor.data.numpy(), expected_new_tensor.data.numpy())

    expected_new_mask = torch.from_numpy(numpy.array([[0, 0, 0], [1, 1, 1], [1, 1, 0]])).bool()
    assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
Example #2
Source File: util_test.py From magnitude with MIT License
def test_remove_sentence_boundaries(self):
    tensor = torch.from_numpy(numpy.random.rand(3, 5, 7))
    mask = torch.from_numpy(
        # The mask with two elements is to test the corner case
        # of an empty sequence, so here we are removing boundaries
        # from "<S> </S>"
        numpy.array([[1, 1, 0, 0, 0], [1, 1, 1, 1, 1], [1, 1, 1, 1, 0]])
    ).long()
    new_tensor, new_mask = util.remove_sentence_boundaries(tensor, mask)

    expected_new_tensor = torch.zeros(3, 3, 7)
    expected_new_tensor[1, 0:3, :] = tensor[1, 1:4, :]
    expected_new_tensor[2, 0:2, :] = tensor[2, 1:3, :]
    assert_array_almost_equal(new_tensor.data.numpy(), expected_new_tensor.data.numpy())

    expected_new_mask = torch.from_numpy(
        numpy.array([[0, 0, 0], [1, 1, 1], [1, 1, 0]])
    ).long()
    assert (new_mask.data.numpy() == expected_new_mask.data.numpy()).all()
Example #3
Source File: elmo_test.py From allennlp with Apache License 2.0
def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, "vocab_test.txt"), "r") as fin:
        words = fin.read().strip().split("\n")

    vocab = Vocabulary()
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, vocab)
    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        char_indices = indices["elmo_tokens"][(k * 50) : ((k + 1) * 50)]
        sentences.append(
            indexer.as_padded_tensor_dict(
                {"elmo_tokens": char_indices}, padding_lengths={"elmo_tokens": 50}
            )["elmo_tokens"]
        )
    batch = torch.stack(sentences)

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output["token_embedding"], elmo_token_embedder_output["mask"]
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, "elmo_token_embeddings.hdf5")
    with h5py.File(embedding_file, "r") as fin:
        expected_embeddings = fin["embedding"][...]

    assert numpy.allclose(actual_embeddings[: len(tokens)], expected_embeddings, atol=1e-6)
Example #4
Source File: elmo.py From magnitude with MIT License
def batch_to_embeddings(self, batch):
    u"""
    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
    A tuple of tensors, the first representing activations (batch_size, 3, num_timesteps, 1024)
    and the second a mask (batch_size, num_timesteps).
    """
    character_ids = batch_to_ids(batch)
    if self.cuda_device >= 0:
        character_ids = character_ids.cuda(device=self.cuda_device)

    bilm_output = self.elmo_bilm(character_ids)
    layer_activations = bilm_output[u'activations']
    mask_with_bos_eos = bilm_output[u'mask']

    # without_bos_eos is a 3 element list of (activation, mask) tensor pairs,
    # each with size (batch_size, num_timesteps, dim) and (batch_size, num_timesteps)
    # respectively.
    without_bos_eos = [remove_sentence_boundaries(layer, mask_with_bos_eos)
                       for layer in layer_activations]
    # Converts a list of pairs (activation, mask) tensors to a single tensor of activations.
    activations = torch.cat([ele[0].unsqueeze(1) for ele in without_bos_eos], dim=1)
    # The mask is the same for each ELMo vector, so just take the first.
    mask = without_bos_eos[0][1]

    return activations, mask
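For context, a hedged usage sketch of how this method is typically reached. The ElmoEmbedder class, its constructor, and the file paths are assumptions drawn from 0.x-era AllenNLP, not from the example above:

from allennlp.commands.elmo import ElmoEmbedder  # assumed 0.x-era API

# Placeholder paths; substitute real ELMo options/weights files.
embedder = ElmoEmbedder(options_file="options.json",
                        weight_file="weights.hdf5",
                        cuda_device=-1)
batch = [["The", "cat", "sat", "."], ["Hello", "world"]]
activations, mask = embedder.batch_to_embeddings(batch)
# activations: (batch_size, 3, num_timesteps, 1024); the <S>/</S> rows are
# already stripped, so num_timesteps matches the longest input sentence.
print(activations.shape, mask.shape)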
Example #5
Source File: elmo_test.py From magnitude with MIT License
def test_elmo_token_representation(self):
    # Load the test words and convert to char ids
    with open(os.path.join(self.elmo_fixtures_path, u'vocab_test.txt'), u'r') as fin:
        words = fin.read().strip().split(u'\n')

    vocab = Vocabulary()
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token(word) for word in words]
    indices = indexer.tokens_to_indices(tokens, vocab, u"elmo")
    # There are 457 tokens. Reshape into 10 batches of 50 tokens.
    sentences = []
    for k in range(10):
        char_indices = indices[u"elmo"][(k * 50):((k + 1) * 50)]
        sentences.append(
            indexer.pad_token_sequence(
                {u'key': char_indices}, desired_num_tokens={u'key': 50}, padding_lengths={}
            )[u'key']
        )
    batch = torch.from_numpy(numpy.array(sentences))

    elmo_token_embedder = _ElmoCharacterEncoder(self.options_file, self.weight_file)
    elmo_token_embedder_output = elmo_token_embedder(batch)

    # Reshape back to a list of words and compare with ground truth. Need to also
    # remove <S>, </S>
    actual_embeddings = remove_sentence_boundaries(
        elmo_token_embedder_output[u'token_embedding'],
        elmo_token_embedder_output[u'mask']
    )[0].data.numpy()
    actual_embeddings = actual_embeddings.reshape(-1, actual_embeddings.shape[-1])

    embedding_file = os.path.join(self.elmo_fixtures_path, u'elmo_token_embeddings.hdf5')
    with h5py.File(embedding_file, u'r') as fin:
        expected_embeddings = fin[u'embedding'][...]

    assert numpy.allclose(actual_embeddings[:len(tokens)], expected_embeddings, atol=1e-6)
Example #6
Source File: elmo_test.py From allennlp with Apache License 2.0
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()
    dataset = AllennlpDataset(instances, vocab)
    # Now finally we can iterate through batches.
    loader = PyTorchDataLoader(dataset, 3)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"]
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        assert lengths.tolist() == expected_lengths

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            assert numpy.allclose(
                top_layer_embeddings[k, : lengths[k], :].data.numpy(),
                expected_top_layer[k],
                atol=1.0e-6,
            )
Example #7
Source File: elmo.py From magnitude with MIT License
def create_cached_cnn_embeddings(self, tokens):
    u"""
    Given a list of tokens, this method precomputes word representations
    by running just the character convolutions and highway layers of elmo,
    essentially creating uncontextual word vectors. On subsequent forward passes,
    the word ids are looked up from an embedding, rather than being computed on
    the fly via the CNN encoder.

    This function sets 3 attributes:

    _word_embedding : ``torch.Tensor``
        The word embedding for each word in the tokens passed to this method.
    _bos_embedding : ``torch.Tensor``
        The embedding for the BOS token.
    _eos_embedding : ``torch.Tensor``
        The embedding for the EOS token.

    Parameters
    ----------
    tokens : ``List[str]``, required.
        A list of tokens to precompute character convolutions for.
    """
    tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
    timesteps = 32
    batch_size = 32
    chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

    all_embeddings = []
    device = get_device_of(next(self.parameters()))
    for batch in lazy_groups_of(chunked_tokens, batch_size):
        # Shape (batch_size, timesteps, 50)
        batched_tensor = batch_to_ids(batch)
        # NOTE: This device check is for when a user calls this method having
        # already placed the model on a device. If this is called in the
        # constructor, it will probably happen on the CPU. This isn't too bad,
        # because it's only a few convolutions and will likely be very fast.
        if device >= 0:
            batched_tensor = batched_tensor.cuda(device)
        output = self._token_embedder(batched_tensor)
        token_embedding = output[u"token_embedding"]
        mask = output[u"mask"]
        token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)
        all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
    full_embedding = torch.cat(all_embeddings, 0)

    # We might have some trailing embeddings from padding in the batch, so
    # we clip the embedding and lookup to the right size.
    full_embedding = full_embedding[:len(tokens), :]
    embedding = full_embedding[2:len(tokens), :]
    vocab_size, embedding_dim = list(embedding.size())

    from allennlp.modules.token_embedders import Embedding  # type: ignore
    self._bos_embedding = full_embedding[0, :]
    self._eos_embedding = full_embedding[1, :]
    self._word_embedding = Embedding(vocab_size,  # type: ignore
                                     embedding_dim,
                                     weight=embedding.data,
                                     trainable=self._requires_grad,
                                     padding_index=0)
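The chunking above relies on lazy_groups_of, which yields fixed-size groups from an iterator (the last group may be shorter). A small illustration, assuming the allennlp.common.util import path and a made-up token list:

from allennlp.common.util import lazy_groups_of

tokens = ["word%d" % i for i in range(70)]   # hypothetical token list
chunks = lazy_groups_of(iter(tokens), 32)    # "sentences" of up to 32 tokens
print([len(chunk) for chunk in chunks])      # [32, 32, 6]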
Example #8
Source File: elmo_test.py From magnitude with MIT License
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in izip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {u'character_ids': indexer})
            instance = Instance({u"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()

    # Now finally we can iterate through batches.
    iterator = BasicIterator(3)
    iterator.index_with(vocab)
    for i, batch in enumerate(iterator(instances, num_epochs=1, shuffle=False)):
        lm_embeddings = elmo_bilm(batch[u'elmo'][u'character_ids'])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings[u'activations'][2],
            lm_embeddings[u'mask']
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, :lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6
                )
            )
Example #9
Source File: get_ELMo_word_embedding_for_a_dataset.py From slot_filling_and_intent_detection_of_SLU with Apache License 2.0
def forward(self, inputs):
    """
    Parameters
    ----------
    inputs: ``torch.Tensor``, required.
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the
        current batch.
    word_inputs : ``torch.Tensor``, required.
        If you passed a cached vocab, you can in addition pass a tensor of shape
        ``(batch_size, timesteps)``, which represent word ids which have been
        pre-cached.

    Returns
    -------
    Dict with keys:
    """
    # reshape the input if needed
    original_shape = inputs.size()
    if len(original_shape) > 3:
        timesteps, num_characters = original_shape[-2:]
        reshaped_inputs = inputs.view(-1, timesteps, num_characters)
    else:
        reshaped_inputs = inputs

    # run the biLM
    bilm_output = self._elmo_lstm(reshaped_inputs, None)
    layer_activations = bilm_output['activations']
    mask_with_bos_eos = bilm_output['mask']
    word_embedding_and_hiddens = torch.cat(layer_activations, dim=-1)
    assert self.output_dim * len(layer_activations) == word_embedding_and_hiddens.size(-1)

    # compute the elmo representations
    representation_with_bos_eos = word_embedding_and_hiddens
    representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
        representation_with_bos_eos, mask_with_bos_eos
    )
    processed_representation = representation_without_bos_eos
    processed_mask = mask_without_bos_eos

    # reshape if necessary
    out_representations = []
    out_representations.append(processed_representation[:, :, :self.output_dim])
    if len(layer_activations) > 1:
        for i in range(1, len(layer_activations)):
            out_representations.append(
                processed_representation[:, :, self.output_dim * i : self.output_dim * (i + 1)]
            )

    return {'elmo_representations': out_representations, 'mask': processed_mask}
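The per-layer outputs at the end are just views into the concatenated activations. A self-contained sketch of that slicing step, with made-up shapes:

import torch

output_dim, num_layers = 4, 3
batch_size, timesteps = 2, 6
# Stand-in for word_embedding_and_hiddens after boundary removal.
processed = torch.rand(batch_size, timesteps, output_dim * num_layers)

# Same slicing as in forward() above: one (batch, timesteps, output_dim)
# view per biLM layer.
layers = [processed[:, :, output_dim * i : output_dim * (i + 1)]
          for i in range(num_layers)]
print([tuple(layer.shape) for layer in layers])  # three tensors of shape (2, 6, 4)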