Python Examples of allennlp.common.util.lazy_groups_of

Source File: predict.py From allennlp with Apache License 2.0

6 votes

def run(self) -> None:
        """
        Predict over every input in batches and emit each result.

        When a dataset reader is configured, inputs come from
        ``self._get_instance_data()`` and are predicted with
        ``self._predict_instances``; otherwise JSON inputs come from
        ``self._get_json_data()`` and are predicted with ``self._predict_json``.
        Each result is handed to ``self._maybe_print_to_console_and_file``
        together with a running index and a string form of its input.

        The output file, if there is one, is closed even when prediction
        raises, so a failure part-way through does not leak the handle.
        """
        has_reader = self._dataset_reader is not None
        index = 0
        try:
            if has_reader:
                for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                    for model_input_instance, result in zip(batch, self._predict_instances(batch)):
                        self._maybe_print_to_console_and_file(
                            index, result, str(model_input_instance)
                        )
                        index += 1
            else:
                for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                    for model_input_json, result in zip(batch_json, self._predict_json(batch_json)):
                        self._maybe_print_to_console_and_file(
                            index, result, json.dumps(model_input_json)
                        )
                        index += 1
        finally:
            # Runs on success and on error alike.
            if self._output_file is not None:
                self._output_file.close()

Source File: elmo.py From magnitude with MIT License

6 votes

def embed_sentences(self,
                        sentences,
                        batch_size=DEFAULT_BATCH_SIZE):
        u"""
        Lazily computes the ELMo embeddings for an iterable of sentences.

        Please note that ELMo has internal state and will give different results for the same input.
        See the comment under the class definition.

        Parameters
        ----------
        sentences : ``Iterable[List[str]]``, required
            An iterable of tokenized sentences.
        batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
            The number of sentences handed to ``self.embed_batch`` at once.

        Returns
        -------
            A generator that yields, batch by batch, each item produced by
            ``self.embed_batch`` (per the original documentation, the ELMo vectors
            for the input sentence at the same index).
        """
        sentence_iterator = iter(sentences)
        for sentence_batch in lazy_groups_of(sentence_iterator, batch_size):
            # Explicit loop instead of ``yield from`` keeps this Python 2 compatible.
            for embedding in self.embed_batch(sentence_batch):
                yield embedding

Source File: bucket_batch_sampler.py From allennlp with Apache License 2.0

5 votes

def __iter__(self) -> Iterable[List[int]]:
        """
        Yield batches of indices in random batch order.

        The indices returned by ``self._argsort_by_padding(self.data_source)`` are
        cut into consecutive groups of ``self.batch_size``.  A group shorter than
        ``self.batch_size`` is discarded when ``self.drop_last`` is set.  The
        resulting batches are shuffled before being yielded.
        """
        sorted_indices, _ = self._argsort_by_padding(self.data_source)

        kept_batches = []
        for chunk in lazy_groups_of(sorted_indices, self.batch_size):
            members = list(chunk)
            # Keep everything unless we were asked to drop a short batch.
            if not self.drop_last or len(members) >= self.batch_size:
                kept_batches.append(members)

        random.shuffle(kept_batches)
        yield from kept_batches

Source File: util_test.py From allennlp with Apache License 2.0

5 votes

def test_lazy_groups_of(self):
        """Seven items grouped by three give [1,2,3], [4,5,6], [7], then exhaustion."""
        values = [1, 2, 3, 4, 5, 6, 7]
        grouped = util.lazy_groups_of(iter(values), group_size=3)
        for expected in ([1, 2, 3], [4, 5, 6], [7]):
            assert next(grouped) == expected
        # Nothing is left after the short final group.
        with pytest.raises(StopIteration):
            next(grouped)

Source File: predict.py From magnitude with MIT License

5 votes

def run(self):
        u"""
        Predict over every input in batches and emit each result.

        When a dataset reader is configured, inputs come from
        ``self._get_instance_data()`` and each result of
        ``self._predict_instances`` is emitted on its own.  Otherwise JSON
        inputs come from ``self._get_json_data()`` and each result of
        ``self._predict_json`` is emitted together with the JSON dump of its input.

        The output file, if there is one, is closed even when prediction
        raises, so a failure part-way through does not leak the handle.
        """
        has_reader = self._dataset_reader is not None
        try:
            if has_reader:
                for batch in lazy_groups_of(self._get_instance_data(), self._batch_size):
                    for result in self._predict_instances(batch):
                        self._maybe_print_to_console_and_file(result)
            else:
                for batch_json in lazy_groups_of(self._get_json_data(), self._batch_size):
                    for model_input, result in izip(batch_json, self._predict_json(batch_json)):
                        self._maybe_print_to_console_and_file(result, json.dumps(model_input))
        finally:
            # Runs on success and on error alike.
            if self._output_file is not None:
                self._output_file.close()

Source File: data_iterator.py From magnitude with MIT License

5 votes

def _memory_sized_lists(self,
                            instances):
        u"""
        Breaks the dataset into "memory-sized" lists of instances,
        which it yields up one at a time until it gets through a full epoch.

        For example, if the dataset is already an in-memory list, and each epoch
        represents one pass through the dataset, it just yields back the dataset.
        Whereas if the dataset is lazily read from disk and we've specified to
        load 1000 instances at a time, then it yields lists of 1000 instances each.

        The grouped cases iterate with a plain ``for`` loop rather than calling
        ``.next()`` inside ``while True``.  Generators have no ``.next()`` method on
        Python 3, and letting ``StopIteration`` escape a generator body is an error
        under PEP 479.  The ``for`` loop ends the generator cleanly on Python 2 and 3.
        """
        lazy = is_lazy(instances)

        # Get an iterator over the next epoch worth of instances.
        iterator = self._take_instances(instances, self._instances_per_epoch)

        # We have four different cases to deal with:

        # With lazy instances and no guidance about how many to load into memory,
        # we just load ``batch_size`` instances at a time:
        if lazy and self._max_instances_in_memory is None:
            for instance_list in lazy_groups_of(iterator, self._batch_size):
                yield instance_list
        # If we specified max instances in memory, lazy or not, we just
        # load ``max_instances_in_memory`` instances at a time:
        elif self._max_instances_in_memory is not None:
            for instance_list in lazy_groups_of(iterator, self._max_instances_in_memory):
                yield instance_list
        # If we have non-lazy instances, and we want all instances each epoch,
        # then we just yield back the list of instances:
        elif self._instances_per_epoch is None:
            yield ensure_list(instances)
        # In the final case we have non-lazy instances, we want a specific number
        # of instances each epoch, and we didn't specify how many instances to load
        # into memory. So we convert the whole iterator to a list:
        else:
            yield list(iterator)

Source File: basic_iterator.py From magnitude with MIT License

5 votes

def _create_batches(self, instances, shuffle):
        u"""
        Yield ``Batch`` objects built from ``instances``.

        The dataset is first split by ``self._memory_sized_lists``.  Each of those
        lists is optionally shuffled, cut into groups of ``self._batch_size``, and
        every group is passed through ``self._ensure_batch_is_sufficiently_small``
        before being wrapped in a ``Batch``.
        """
        for chunk in self._memory_sized_lists(instances):
            if shuffle:
                # In-place shuffle of the list handed to us.
                random.shuffle(chunk)
            for group in lazy_groups_of(iter(chunk), self._batch_size):
                for small_group in self._ensure_batch_is_sufficiently_small(group):
                    yield Batch(small_group)

Source File: test_util.py From magnitude with MIT License

5 votes

def test_lazy_groups_of(self):
        u"""Grouping seven items by three gives two full groups, one short group, then stops."""
        source = iter([1, 2, 3, 4, 5, 6, 7])
        chunks = util.lazy_groups_of(source, group_size=3)

        expected_chunks = [[1, 2, 3], [4, 5, 6], [7]]
        for expected in expected_chunks:
            assert next(chunks) == expected

        # The iterator is exhausted after the final, shorter group.
        with pytest.raises(StopIteration):
            next(chunks)

Source File: predict_with_vocab_expansion.py From multee with Apache License 2.0

5 votes

def run(self) -> None:
        """
        Load all inputs, extend the model vocabulary from them, then predict in batches.

        Instead of reading the data lazily, everything is loaded up front so that
        the model's vocabulary (and its embedders, using the optional JSON mapping
        in ``self._embedding_sources_mapping``) can be extended before prediction.
        Each result is handed to ``self._maybe_print_to_console_and_file`` together
        with a running index and a string form of its input.

        The output file, if there is one, is closed even when loading, vocabulary
        extension or prediction raises, so a failure does not leak the handle.
        """
        has_reader = self._dataset_reader is not None
        index = 0

        try:
            if has_reader:
                instances = list(self._get_instance_data())
            else:
                jsons = list(self._get_json_data())
                # The loop variable is deliberately not called ``json`` so it does
                # not shadow the ``json`` module used below.
                instances = [self._predictor._json_to_instance(json_dict) for json_dict in jsons]

            embedding_sources: Dict[str, str] = (json.loads(self._embedding_sources_mapping)
                                                 if self._embedding_sources_mapping else {})
            self._predictor._model.vocab.extend_from_instances(Params({}), instances=instances)
            self._predictor._model.extend_embedder_vocab(embedding_sources)

            if has_reader:
                for batch in lazy_groups_of(iter(instances), self._batch_size):
                    for model_input_instance, result in zip(batch, self._predict_instances(batch)):
                        self._maybe_print_to_console_and_file(index, result, str(model_input_instance))
                        index += 1
            else:
                for batch_json in lazy_groups_of(iter(jsons), self._batch_size):
                    for model_input_json, result in zip(batch_json, self._predict_json(batch_json)):
                        self._maybe_print_to_console_and_file(index, result, json.dumps(model_input_json))
                        index += 1
        finally:
            # Runs on success and on error alike.
            if self._output_file is not None:
                self._output_file.close()

Source File: bag_iterator.py From DISTRE with Apache License 2.0

4 votes

def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        """
        Yield batches built from instances grouped into bags by ``instance_id``.

        For every list produced by ``self._memory_sized_lists``, instances are
        grouped by ``instance['metadata']['instance_id']``.  The bag order is always
        shuffled; the instances inside a bag are shuffled only when ``shuffle`` is
        true.  The bags are then concatenated, cut into groups of
        ``self._batch_size``, and reduced by
        ``self._ensure_batch_is_sufficiently_small``.  If
        ``self._biggest_batch_first`` is set and there is more than one batch, the
        last two batches are moved to the front (last batch first).
        """
        def bag_key(instance):
            return instance['metadata']['instance_id']

        for instance_list in self._memory_sized_lists(instances):

            # groupby only merges adjacent items, hence the sort on the same key.
            bags = {
                bag_id: list(members)
                for bag_id, members in groupby(sorted(instance_list, key=bag_key), key=bag_key)
            }

            bag_order = list(bags.keys())
            random.shuffle(bag_order)

            print('creating new instances')

            ordered_instances = []
            for bag_id in bag_order:
                members = bags[bag_id]
                if shuffle:
                    random.shuffle(members)
                ordered_instances.extend(members)

            print('creating batches')

            batches = [
                Batch(small_group)
                for group in lazy_groups_of(iter(ordered_instances), self._batch_size)
                for small_group in self._ensure_batch_is_sufficiently_small(group)
            ]

            print('num batches:', len(batches))

            if self._biggest_batch_first and len(batches) > 1:
                # Pop the last _two_ batches, because the last one might not be full,
                # and put them at the front with the last batch first.
                last_batch = batches.pop()
                penultimate_batch = batches.pop()
                batches[:0] = [last_batch, penultimate_batch]

            print('yielding from batches')
            yield from batches

Source File: trainer.py From NLP_Toolkit with Apache License 2.0

4 votes

def _validation_loss(self) -> Tuple[float, int]:
        """
        Run one pass over the validation data and accumulate the loss.

        The model is put in eval mode.  If a moving average is configured, its
        averaged values are assigned for the duration of the pass and restored
        afterwards.  Batches are grouped ``len(self._cuda_devices)`` at a time and
        each group is passed to ``self.batch_loss``.

        Returns
        -------
        The summed validation loss and the number of batch groups that produced
        a (non-``None``) loss.
        """
        logger.info("Validating")

        self.model.eval()

        # Swap in the shadow (averaged) parameter values while validating.
        if self._moving_average is not None:
            self._moving_average.assign_average_value()

        # Fall back to the training iterator when no dedicated one was given.
        val_iterator = (
            self._validation_iterator
            if self._validation_iterator is not None
            else self.iterator
        )

        num_gpus = len(self._cuda_devices)

        batch_stream = val_iterator(self._validation_data, num_epochs=1, shuffle=False)
        grouped_batches = lazy_groups_of(batch_stream, num_gpus)
        total_groups = math.ceil(val_iterator.get_num_batches(self._validation_data) / num_gpus)
        progress = Tqdm.tqdm(grouped_batches, total=total_groups)

        val_loss = 0
        batches_this_epoch = 0
        for batch_group in progress:
            loss = self.batch_loss(batch_group, for_training=False)

            # A loss is not mandatory during validation, so ``None`` is tolerated.
            # ``batches_this_epoch`` only serves as the divisor for the loss, which is
            # why only groups that actually returned a loss are counted.
            if loss is not None:
                batches_this_epoch += 1
                val_loss += loss.detach().cpu().numpy()

            # Show the latest metrics on the progress bar.
            current_metrics = training_util.get_metrics(self.model, val_loss, batches_this_epoch)
            progress.set_description(
                training_util.description_from_metrics(current_metrics), refresh=False
            )

        # Put the original parameter values back.
        if self._moving_average is not None:
            self._moving_average.restore()

        return val_loss, batches_this_epoch

Source File: elmo_sequential_embedder.py From Unsupervised-Sentence-Summarization with MIT License

4 votes

def embed_sentences(self,
                        sentences: Iterable[List[str]],
                        add_bos: bool = False,
                        add_eos: bool = False,
                        initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
                        batch_size: int = DEFAULT_BATCH_SIZE) -> \
                    List[Tuple[numpy.ndarray, Tuple[torch.Tensor, torch.Tensor]]]:
        """
        Computes the forward-only ELMo embeddings for an iterable of sentences.
        See the comment under the class definition.
        Parameters
        ----------
        sentences : ``Iterable[List[str]]``, required
            An iterable of tokenized sentences.
        add_bos: ``bool``
            Whether to add begin of sentence token.
        add_eos: ``bool``
            Whether to add end of sentence token.
        initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
            A tuple (state, memory) passed unchanged to ``self.forward`` for every batch.
            As documented by the original author, the tensors have shape
            (num_layers, batch_size, 1 * hidden_size) and
            (num_layers, batch_size, 1 * cell_size) respectively, or
            (num_layers, 1 * hidden_size) and (num_layers, 1 * cell_size) when the
            whole batch shares one initial state.
        batch_size : ``int``, optional (default = ``DEFAULT_BATCH_SIZE``)
            The number of sentences ELMo should process at once.
        Returns
        -------
            A list with one ``(embedding, (state, memory))`` tuple per entry of the
            embeddings returned by ``self.forward``, in input order.  The state
            tensors are the final states split along dimension 1, with that
            dimension squeezed out.
        """
        results = []
        print('Embedding sentences into forward ELMo vectors ---')
        for sentence_batch in lazy_groups_of(iter(sentences), batch_size):
            elmo_embeddings, final_states = self.forward(sentence_batch, add_bos, add_eos, initial_state)
            # final_states is a tuple of two tensors.  Split each along dim 1 into
            # size(1) slices and squeeze that dim away.
            per_tensor_slices = [
                [torch.squeeze(piece, dim=1) for piece in tensor.chunk(tensor.size(1), dim=1)]
                for tensor in (final_states[0], final_states[1])
            ]
            # Transpose: one (state, memory) pair per slice.
            final_states_chunked = list(zip(*per_tensor_slices))
            assert len(elmo_embeddings) == len(final_states_chunked), 'length of embeddings and final states mismatch'
            results.extend(zip(elmo_embeddings, final_states_chunked))
        return results

Source File: elmo_sequential_embedder.py From Unsupervised-Sentence-Summarization with MIT License

4 votes

def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:
        """
        Precompute and cache word representations for ``tokens``.

        The tokens (prefixed with the BOS and EOS tokens) are run through
        ``self._token_embedder`` once.  Per the original documentation this runs
        just the character convolutions and highway layers of ELMo, producing
        uncontextual word vectors, so that later forward passes can look words up
        in an embedding instead of running the CNN encoder on the fly.
        This function sets 3 attributes:
        _word_embedding : ``Embedding``
            An embedding holding one vector per word in ``tokens``.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.
        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        all_tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        # Rows of ``timesteps`` tokens, later grouped ``batch_size`` rows at a time.
        token_rows = lazy_groups_of(iter(all_tokens), timesteps)

        embedding_chunks = []
        device = get_device_of(next(self.parameters()))
        for row_batch in lazy_groups_of(token_rows, batch_size):
            # Shape (batch_size, timesteps, 50) according to the original note.
            char_ids = batch_to_ids(row_batch)
            # NOTE: the device check covers callers that have already moved the
            # model to a device.  Per the original author, when this runs in the
            # constructor it will probably happen on the CPU, which is acceptable
            # because it is only a few convolutions.
            if device >= 0:
                char_ids = char_ids.cuda(device)
            embedder_output = self._token_embedder(char_ids, add_bos=False, add_eos=False)
            trimmed, _ = remove_sentence_boundaries(embedder_output["token_embedding"],
                                                    embedder_output["mask"],
                                                    rmv_bos=False,
                                                    rmv_eos=False)
            # Flatten to one row per token position.
            embedding_chunks.append(trimmed.view(-1, trimmed.size(-1)))
        stacked = torch.cat(embedding_chunks, 0)

        # There may be trailing rows that come from padding in the last batch,
        # so clip to the number of real tokens.
        token_count = len(all_tokens)
        stacked = stacked[:token_count, :]
        # Rows 0 and 1 are BOS and EOS; the rest are the caller's tokens.
        word_vectors = stacked[2:token_count, :]
        vocab_size, embedding_dim = word_vectors.size()

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = stacked[0, :]
        self._eos_embedding = stacked[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=word_vectors.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)

Source File: elmo.py From magnitude with MIT License

4 votes

def create_cached_cnn_embeddings(self, tokens):
        u"""
        Precompute and cache word representations for ``tokens``.

        The tokens (prefixed with the BOS and EOS tokens) are run through
        ``self._token_embedder`` once.  Per the original documentation this runs
        just the character convolutions and highway layers of ELMo, producing
        uncontextual word vectors, so that later forward passes can look words up
        in an embedding instead of running the CNN encoder on the fly.

        This function sets 3 attributes:

        _word_embedding : ``Embedding``
            An embedding holding one vector per word in ``tokens``.
        _bos_embedding : ``torch.Tensor``
            The embedding for the BOS token.
        _eos_embedding : ``torch.Tensor``
            The embedding for the EOS token.

        Parameters
        ----------
        tokens : ``List[str]``, required.
            A list of tokens to precompute character convolutions for.
        """
        all_tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
        timesteps = 32
        batch_size = 32
        # Rows of ``timesteps`` tokens, later grouped ``batch_size`` rows at a time.
        token_rows = lazy_groups_of(iter(all_tokens), timesteps)

        embedding_chunks = []
        device = get_device_of(next(self.parameters()))
        for row_batch in lazy_groups_of(token_rows, batch_size):
            # Shape (batch_size, timesteps, 50) according to the original note.
            char_ids = batch_to_ids(row_batch)
            # NOTE: the device check covers callers that have already moved the
            # model to a device.  Per the original author, when this runs in the
            # constructor it will probably happen on the CPU, which is acceptable
            # because it is only a few convolutions.
            if device >= 0:
                char_ids = char_ids.cuda(device)
            embedder_output = self._token_embedder(char_ids)
            trimmed, _ = remove_sentence_boundaries(embedder_output[u"token_embedding"],
                                                    embedder_output[u"mask"])
            # Flatten to one row per token position.
            embedding_chunks.append(trimmed.view(-1, trimmed.size(-1)))
        stacked = torch.cat(embedding_chunks, 0)

        # There may be trailing rows that come from padding in the last batch,
        # so clip to the number of real tokens.
        token_count = len(all_tokens)
        stacked = stacked[:token_count, :]
        # Rows 0 and 1 are BOS and EOS; the rest are the caller's tokens.
        word_vectors = stacked[2:token_count, :]
        vocab_size, embedding_dim = word_vectors.size()

        from allennlp.modules.token_embedders import Embedding # type: ignore
        self._bos_embedding = stacked[0, :]
        self._eos_embedding = stacked[1, :]
        self._word_embedding = Embedding(vocab_size, # type: ignore
                                         embedding_dim,
                                         weight=word_vectors.data,
                                         trainable=self._requires_grad,
                                         padding_index=0)

Python allennlp.common.util.lazy_groups_of() Examples