Python utils.save_checkpoint() Examples

The following are 4 code examples of utils.save_checkpoint(), drawn from open-source projects. You can go to the original project or source file by following the link noted above each example. You may also want to check out the other available functions and classes of the utils module.
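
The exact signature of utils.save_checkpoint() varies from project to project. Examples #2 and #3 below pass a state dict, an is_best flag, and a checkpoint directory; the helper below is only a minimal sketch of that pattern, not the implementation from any of the listed projects, and the file names last.pth.tar and best.pth.tar are illustrative assumptions.

import os
import shutil

import torch

def save_checkpoint(state, is_best, checkpoint):
    """Save `state` to `<checkpoint>/last.pth.tar`; copy it to `best.pth.tar` if it is the best so far."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
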
Example #1
Source File: nasbench_weight_sharing_policy.py    From eval-nas with MIT License
def run(self):
        """
        Procedure of training. This run describes the entire training procedure.
        :return:
        """
        train_queue, valid_queue, test_queue, criterion = self.initialize_run()
        args = self.args
        model, optimizer, scheduler = self.initialize_model()
        fitness_dict = {}
        self.optimizer = optimizer
        self.scheduler = scheduler
        logging.info(">> Begin the search with supernet method :".format(args.supernet_train_method))

        for epoch in range(args.epochs):
            scheduler.step()
            lr = scheduler.get_lr()[0]

            train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
            self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})

            # validation
            valid_acc, valid_obj = self.validate_model(model, valid_queue, self.model_spec_id, self.model_spec)
            self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')

            if not self.check_should_save(epoch):
                continue
            # evaluate process.
            self.save_duplicate_arch_pool('valid', epoch)
            fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)
            utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
            self.save_results(epoch, rank_details=True)

        # Process the ranking at the end of training and return the best
        # model spec evaluated over time.
        ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
        best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
        return best_id, self.search_space.nasbench_model_specs[best_id] 
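
In this project the helper receives the model, optimizer, running statistics, and experiment directory directly instead of a pre-built state dict. The actual implementation lives in the eval-nas utils module; the sketch below only illustrates what a helper with that signature typically does, and the file name checkpoint.pt and the dict keys are assumptions.

import os

import torch

def save_checkpoint(model, optimizer, running_stats, exp_dir):
    # Hypothetical sketch: bundle the model weights, optimizer state, and
    # bookkeeping statistics into a single file in the experiment directory.
    os.makedirs(exp_dir, exist_ok=True)
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'running_stats': running_stats},
               os.path.join(exp_dir, 'checkpoint.pt'))
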
Example #2
Source File: train.py    From NER-BERT-pytorch with MIT License
def train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_f1 = 0.0
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute number of batches in one epoch
        params.train_steps = params.train_size // params.batch_size
        params.val_steps = params.val_size // params.batch_size

        # data iterator for training
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
        # Train for one epoch on training set
        train(model, train_data_iterator, optimizer, scheduler, params)

        # data iterator for evaluation
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

        # Evaluate for one epoch on training set and validation set
        params.eval_steps = params.train_steps
        train_metrics = evaluate(model, train_data_iterator, params, mark='Train')
        params.eval_steps = params.val_steps
        val_metrics = evaluate(model, val_data_iterator, params, mark='Val')
        
        val_f1 = val_metrics['f1']
        improve_f1 = val_f1 - best_val_f1

        # Save weights of the network
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model_to_save.state_dict(),
                               'optim_dict': optimizer_to_save.state_dict()},
                               is_best=improve_f1>0,
                               checkpoint=model_dir)
        if improve_f1 > 0:
            logging.info("- Found new best F1")
            best_val_f1 = val_f1
            if improve_f1 < params.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best f1
        if (patience_counter >= params.patience_num and epoch > params.min_epoch_num) or epoch == params.epoch_num:
            logging.info("Best val f1: {:05.2f}".format(best_val_f1))
            break 
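
The matching utils.load_checkpoint() called at the top of this example presumably restores the model weights and, optionally, the optimizer state. Below is a minimal sketch, assuming the dict keys used in the save call above ('state_dict' and 'optim_dict'); the real helper in NER-BERT-pytorch may differ.

import torch

def load_checkpoint(restore_path, model, optimizer=None):
    # Hypothetical counterpart to save_checkpoint(): restore the model weights
    # and, if an optimizer is given, its state as well.
    checkpoint = torch.load(restore_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint
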
Example #3
Source File: train.py    From Pytorch-STN with MIT License
def train_and_eval(net, train_loader, val_loader, optimizer, loss_fn, metrics, params, model_dir, restore=None):
    """
    Train and evaluate every epoch of a model.
    net: The model. 
    train/val loader: The data loaders
    params: The parameters parsed from JSON file 
    restore: if there is a checkpoint restore from that point. 
    """
    best_val_acc = 0.0 
    if restore is not None:
        restore_file = os.path.join(model_dir, restore + '.pth.tar')
        logging.info("Restoring parameters from: {}".format(restore_file))
        utils.load_checkpoint(restore_file, net, optimizer)

    for ep in range(params.num_epochs):
        logging.info("Running epoch: {}/{}".format(ep+1, params.num_epochs))

        # train one epoch 
        train(net, train_loader, loss_fn, params, metrics, optimizer)

        val_metrics = evaluate(net, val_loader, loss_fn, params, metrics)

        val_acc = val_metrics['accuracy']
        isbest = val_acc >= best_val_acc 

        utils.save_checkpoint({"epoch":ep, "state_dict":net.state_dict(), "optimizer":optimizer.state_dict()}, 
        isBest=isbest, ckpt_dir=model_dir)
    
        if isbest:
            # if this is the best accuracy so far, save the metrics to the best-metrics JSON file
            logging.info("New best accuracy found!")
            best_val_acc = val_acc 
            best_json_path = os.path.join(model_dir, "best_model_params.json")
            utils.save_dict_to_json(val_metrics, best_json_path)
        
        last_acc_path = os.path.join(model_dir, 'last_acc_metrics.json')
        utils.save_dict_to_json(val_metrics, last_acc_path) 
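
This example also relies on utils.save_dict_to_json() to persist the metrics. A minimal sketch of such a helper, assuming the metric values are plain scalars; the actual Pytorch-STN implementation may differ.

import json

def save_dict_to_json(d, json_path):
    # Hypothetical sketch: cast values to float so NumPy/torch scalars
    # serialize cleanly, then write them as pretty-printed JSON.
    with open(json_path, 'w') as f:
        json.dump({k: float(v) for k, v in d.items()}, f, indent=4)
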
Example #4
Source File: nasbench_weight_sharing_policy.py    From eval-nas with MIT License
def run(self):
        """
        The difference from super().run() is that it changes the eval pool by randomly sampling new
        architectures to replace the old ones.
        :return:
        """
        train_queue, valid_queue, test_queue, criterion = self.initialize_run()
        repeat_valid_queue = RepeatedDataLoader(valid_queue)
        args = self.args
        model, optimizer, scheduler = self.initialize_model()
        fitness_dict = {}
        self.optimizer = optimizer
        self.scheduler = scheduler
        logging.info(">> Begin the search with supernet method: {}".format(args.supernet_train_method))
        logging.info("Always setting BN-train to True!")
        for epoch in range(args.epochs):
            args.current_epoch = epoch
            lr = scheduler.get_lr()[0]
            logging.info('epoch %d lr %e', epoch, lr)

            # training for each epoch.
            train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
            self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})

            # step the scheduler after the optimizer step, as required since PyTorch 1.1.0
            scheduler.step()

            # Validation: unlike the traditional approach, only the eval arch pool is evaluated in this step.
            validate_accuracies, valid_acc, valid_obj = self.child_valid(
                model, repeat_valid_queue, self.evaluate_model_spec_id_pool(), criterion)

            self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')

            if not self.check_should_save(epoch):
                continue
            self.save_duplicate_arch_pool('valid', epoch)
            logging.info("Evaluating and save the results.")
            utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
            logging.info("Totally %d architectures now to evaluate", len(self.evaluate_model_spec_id_pool()))
            # evaluate steps.
            fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)
            # Generate new archs for robust evaluation.
            # replace bottom archs
            num_new_archs = self.search_space.replace_eval_ids_by_random(args.controller_random_arch)
            logging.info("Generate %d new archs", num_new_archs)
            self.save_results(epoch, rank_details=True)

        # Process the ranking at the end of training and return the best
        # model spec evaluated over time.
        ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
        best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
        return best_id, self.nasbench_model_specs[best_id]