Python utils.save_checkpoint() Examples

The following are 4 code examples of utils.save_checkpoint(), drawn from open-source projects. You can go to the original project or source file by following the link noted above each example. You may also want to check out the other available functions and classes of the utils module.
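
The exact signature of utils.save_checkpoint() varies from project to project. Examples #2 and #3 below pass a state dict, an is_best flag, and a checkpoint directory; the helper below is only a minimal sketch of that pattern, not the implementation from any of the listed projects, and the file names last.pth.tar and best.pth.tar are illustrative assumptions.

import os
import shutil

import torch

def save_checkpoint(state, is_best, checkpoint):
    """Save `state` to `<checkpoint>/last.pth.tar`; copy it to `best.pth.tar` if it is the best so far."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))
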
Example #1
Source File: nasbench_weight_sharing_policy.py    From eval-nas with MIT License
def run(self):
        """
        Procedure of training. This run describes the entire training procedure.
        :return:
        """
        train_queue, valid_queue, test_queue, criterion = self.initialize_run()
        args = self.args
        model, optimizer, scheduler = self.initialize_model()
        fitness_dict = {}
        self.optimizer = optimizer
        self.scheduler = scheduler
        logging.info(">> Begin the search with supernet method :".format(args.supernet_train_method))

        for epoch in range(args.epochs):
            scheduler.step()
            lr = scheduler.get_lr()[0]

            train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
            self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})

            # validation
            valid_acc, valid_obj = self.validate_model(model, valid_queue, self.model_spec_id, self.model_spec)
            self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')

            if not self.check_should_save(epoch):
                continue
            # evaluate process.
            self.save_duplicate_arch_pool('valid', epoch)
            fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)
            utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
            self.save_results(epoch, rank_details=True)

        # Process the ranking at the end of training and return the best
        # model spec evaluated over time.
        ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
        best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
        return best_id, self.search_space.nasbench_model_specs[best_id] 
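
In this project the helper receives the model, optimizer, running statistics, and experiment directory directly instead of a pre-built state dict. The actual implementation lives in the eval-nas utils module; the sketch below only illustrates what a helper with that signature typically does, and the file name checkpoint.pt and the dict keys are assumptions.

import os

import torch

def save_checkpoint(model, optimizer, running_stats, exp_dir):
    # Hypothetical sketch: bundle the model weights, optimizer state, and
    # bookkeeping statistics into a single file in the experiment directory.
    os.makedirs(exp_dir, exist_ok=True)
    torch.save({'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'running_stats': running_stats},
               os.path.join(exp_dir, 'checkpoint.pt'))
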
Example #2
Source File: train.py    From NER-BERT-pytorch with MIT License
def train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        
    best_val_f1 = 0.0
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute number of batches in one epoch
        params.train_steps = params.train_size // params.batch_size
        params.val_steps = params.val_size // params.batch_size

        # data iterator for training
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
        # Train for one epoch on training set
        train(model, train_data_iterator, optimizer, scheduler, params)

        # data iterator for evaluation
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

        # Evaluate for one epoch on training set and validation set
        params.eval_steps = params.train_steps
        train_metrics = evaluate(model, train_data_iterator, params, mark='Train')
        params.eval_steps = params.val_steps
        val_metrics = evaluate(model, val_data_iterator, params, mark='Val')
        
        val_f1 = val_metrics['f1']
        improve_f1 = val_f1 - best_val_f1

        # Save weights of the network
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model_to_save.state_dict(),
                               'optim_dict': optimizer_to_save.state_dict()},
                               is_best=improve_f1>0,
                               checkpoint=model_dir)
        if improve_f1 > 0:
            logging.info("- Found new best F1")
            best_val_f1 = val_f1
            if improve_f1 < params.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging best f1
        if (patience_counter >= params.patience_num and epoch > params.min_epoch_num) or epoch == params.epoch_num:
            logging.info("Best val f1: {:05.2f}".format(best_val_f1))
            break 
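
The matching utils.load_checkpoint() called at the top of this example presumably restores the model weights and, optionally, the optimizer state. Below is a minimal sketch, assuming the dict keys used in the save call above ('state_dict' and 'optim_dict'); the real helper in NER-BERT-pytorch may differ.

import torch

def load_checkpoint(restore_path, model, optimizer=None):
    # Hypothetical counterpart to save_checkpoint(): restore the model weights
    # and, if an optimizer is given, its state as well.
    checkpoint = torch.load(restore_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint
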
Example #3
Source File: train.py    From Pytorch-STN with MIT License
def train_and_eval(net, train_loader, val_loader, optimizer, loss_fn, metrics, params, model_dir, restore=None):
    """
    Train and evaluate every epoch of a model.
    net: The model. 
    train/val loader: The data loaders
    params: The parameters parsed from JSON file 
    restore: if there is a checkpoint restore from that point. 
    """
    best_val_acc = 0.0 
    if restore is not None:
        restore_file = os.path.join(model_dir, restore + '.pth.tar')
        logging.info("Restoring parameters from: {}".format(restore_file))
        utils.load_checkpoint(restore_file, net, optimizer)

    for ep in range(params.num_epochs):
        logging.info("Running epoch: {}/{}".format(ep+1, params.num_epochs))

        # train one epoch 
        train(net, train_loader, loss_fn, params, metrics, optimizer)

        val_metrics = evaluate(net, val_loader, loss_fn, params, metrics)

        val_acc = val_metrics['accuracy']
        isbest = val_acc >= best_val_acc 

        utils.save_checkpoint({"epoch":ep, "state_dict":net.state_dict(), "optimizer":optimizer.state_dict()}, 
        isBest=isbest, ckpt_dir=model_dir)
    
        if isbest:
            # if this is the best accuracy so far, save the metrics to the best-metrics JSON file
            logging.info("New best accuracy found!")
            best_val_acc = val_acc 
            best_json_path = os.path.join(model_dir, "best_model_params.json")
            utils.save_dict_to_json(val_metrics, best_json_path)
        
        last_acc_path = os.path.join(model_dir, 'last_acc_metrics.json')
        utils.save_dict_to_json(val_metrics, last_acc_path) 
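
This example also relies on utils.save_dict_to_json() to persist the metrics. A minimal sketch of such a helper, assuming the metric values are plain scalars; the actual Pytorch-STN implementation may differ.

import json

def save_dict_to_json(d, json_path):
    # Hypothetical sketch: cast values to float so NumPy/torch scalars
    # serialize cleanly, then write them as pretty-printed JSON.
    with open(json_path, 'w') as f:
        json.dump({k: float(v) for k, v in d.items()}, f, indent=4)
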
Example #4
Source File: nasbench_weight_sharing_policy.py    From eval-nas with MIT License
def run(self):
        """
        The difference from super().run() is that it changes the eval pool by randomly sampling new
        architectures to replace the old ones.
        :return:
        """
        train_queue, valid_queue, test_queue, criterion = self.initialize_run()
        repeat_valid_queue = RepeatedDataLoader(valid_queue)
        args = self.args
        model, optimizer, scheduler = self.initialize_model()
        fitness_dict = {}
        self.optimizer = optimizer
        self.scheduler = scheduler
        logging.info(">> Begin the search with supernet method: {}".format(args.supernet_train_method))
        logging.info("Always setting BN-train to True!")
        for epoch in range(args.epochs):
            args.current_epoch = epoch
            lr = scheduler.get_lr()[0]
            logging.info('epoch %d lr %e', epoch, lr)

            # training for each epoch.
            train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
            self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})

            # step the scheduler after the optimizer step, as required since PyTorch 1.1.0
            scheduler.step()

            # Validation: unlike the traditional approach, only the eval arch pool is evaluated in this step.
            validate_accuracies, valid_acc, valid_obj = self.child_valid(
                model, repeat_valid_queue, self.evaluate_model_spec_id_pool(), criterion)

            self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')

            if not self.check_should_save(epoch):
                continue
            self.save_duplicate_arch_pool('valid', epoch)
            logging.info("Evaluating and save the results.")
            utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
            logging.info("Totally %d architectures now to evaluate", len(self.evaluate_model_spec_id_pool()))
            # evaluate steps.
            fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)
            # Generate new archs for robust evaluation.
            # replace bottom archs
            num_new_archs = self.search_space.replace_eval_ids_by_random(args.controller_random_arch)
            logging.info("Generate %d new archs", num_new_archs)
            self.save_results(epoch, rank_details=True)

        # Process the ranking at the end of training and return the best
        # model spec evaluated over time.
        ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
        best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
        return best_id, self.nasbench_model_specs[best_id]