Python utils.save_checkpoint() Examples
The following are 4 code examples of utils.save_checkpoint(). Each example is taken from an open-source project; the project and source file are noted above each example. You may also want to check out the other functions and classes available in the utils module.
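Note that utils here is a project-local module, so the exact signature of save_checkpoint() differs between projects. A common pattern, and roughly what Examples #2 and #3 below rely on, is a pair of helpers that write a state dict to disk and reload it later. The sketch below is illustrative only and does not reproduce any of the listed projects' code:

import os
import shutil

import torch


def save_checkpoint(state, is_best, checkpoint):
    """Save a training state dict to <checkpoint>/last.pth.tar and, if it is the
    best model seen so far, copy it to <checkpoint>/best.pth.tar.
    Illustrative sketch only; each project defines its own variant."""
    os.makedirs(checkpoint, exist_ok=True)
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))


def load_checkpoint(checkpoint, model, optimizer=None):
    """Reload model (and optionally optimizer) state from a checkpoint file."""
    state = torch.load(checkpoint)
    model.load_state_dict(state['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(state['optim_dict'])
    return state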
Example #1
Source File: nasbench_weight_sharing_policy.py From eval-nas with MIT License
def run(self):
    """
    Procedure of training. This run describes the entire training procedure.
    :return:
    """
    train_queue, valid_queue, test_queue, criterion = self.initialize_run()
    args = self.args
    model, optimizer, scheduler = self.initialize_model()
    fitness_dict = {}
    self.optimizer = optimizer
    self.scheduler = scheduler
    logging.info(">> Begin the search with supernet method: {}".format(args.supernet_train_method))
    for epoch in range(args.epochs):
        scheduler.step()
        lr = scheduler.get_lr()[0]
        train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
        self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})
        # validation
        valid_acc, valid_obj = self.validate_model(model, valid_queue, self.model_spec_id, self.model_spec)
        self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')
        if not self.check_should_save(epoch):
            continue
        # evaluation step: save the architecture pool, evaluate, and checkpoint.
        self.save_duplicate_arch_pool('valid', epoch)
        fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)
        utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
        self.save_results(epoch, rank_details=True)
    # Process the ranking at the end and return the best architecture found during training.
    ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
    best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
    return best_id, self.search_space.nasbench_model_specs[best_id]
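Here save_checkpoint() is called positionally with the model, optimizer, a running-stats object, and the experiment directory, rather than with a pre-built state dict as in Example #2 below. A helper matching that call shape could look roughly like the following; the file name and dictionary keys are assumptions for illustration, not the actual eval-nas implementation:

import os

import torch


def save_checkpoint(model, optimizer, running_stats, exp_dir):
    """Hypothetical helper matching the positional call above: bundle the model,
    optimizer, and bookkeeping state into a single file in the experiment directory."""
    os.makedirs(exp_dir, exist_ok=True)
    torch.save({'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'running_stats': running_stats},
               os.path.join(exp_dir, 'checkpoint.pt'))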
Example #2
Source File: train.py From NER-BERT-pytorch with MIT License
def train_and_evaluate(model, train_data, val_data, optimizer, scheduler, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(args.model_dir, args.restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_f1 = 0.0
    patience_counter = 0

    for epoch in range(1, params.epoch_num + 1):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch, params.epoch_num))

        # Compute the number of batches in one epoch
        params.train_steps = params.train_size // params.batch_size
        params.val_steps = params.val_size // params.batch_size

        # data iterator for training
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=True)
        # Train for one epoch on the training set
        train(model, train_data_iterator, optimizer, scheduler, params)

        # data iterators for evaluation
        train_data_iterator = data_loader.data_iterator(train_data, shuffle=False)
        val_data_iterator = data_loader.data_iterator(val_data, shuffle=False)

        # Evaluate for one epoch on the training set and the validation set
        params.eval_steps = params.train_steps
        train_metrics = evaluate(model, train_data_iterator, params, mark='Train')
        params.eval_steps = params.val_steps
        val_metrics = evaluate(model, val_data_iterator, params, mark='Val')

        val_f1 = val_metrics['f1']
        improve_f1 = val_f1 - best_val_f1

        # Save the weights of the network
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        optimizer_to_save = optimizer.optimizer if args.fp16 else optimizer
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model_to_save.state_dict(),
                               'optim_dict': optimizer_to_save.state_dict()},
                              is_best=improve_f1 > 0,
                              checkpoint=model_dir)

        if improve_f1 > 0:
            logging.info("- Found new best F1")
            best_val_f1 = val_f1
            if improve_f1 < params.patience:
                patience_counter += 1
            else:
                patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping and logging of the best F1
        if (patience_counter >= params.patience_num and epoch > params.min_epoch_num) or epoch == params.epoch_num:
            logging.info("Best val f1: {:05.2f}".format(best_val_f1))
            break
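Two details of this call are worth noting. The state dict is built from model.module when that attribute exists, presumably because the model may be wrapped in nn.DataParallel and only the underlying module's weights should be saved, and under fp16 training the inner optimizer.optimizer is saved instead of the outer wrapper. The checkpoint is written every epoch, with is_best marking whether the validation F1 improved.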
Example #3
Source File: train.py From Pytorch-STN with MIT License
def train_and_eval(net, train_loader, val_loader, optimizer, loss_fn, metrics, params, model_dir, restore=None):
    """
    Train the model and evaluate it every epoch.
    net: the model
    train_loader / val_loader: the data loaders
    params: the parameters parsed from the JSON file
    restore: if there is a checkpoint, restore from that point
    """
    best_val_acc = 0.0
    if restore is not None:
        restore_file = os.path.join(args.param_path, args.resume_path + '_pth.tar')
        logging.info("Loaded checkpoints from: {}".format(restore_file))
        utils.load_checkpoint(restore_file, net, optimizer)

    for ep in range(params.num_epochs):
        logging.info("Running epoch: {}/{}".format(ep + 1, params.num_epochs))

        # train one epoch, then evaluate on the validation set
        train(net, train_loader, loss_fn, params, metrics, optimizer)
        val_metrics = evaluate(net, val_loader, loss_fn, params, metrics)
        val_acc = val_metrics['accuracy']
        isbest = val_acc >= best_val_acc

        utils.save_checkpoint({"epoch": ep,
                               "state_dict": net.state_dict(),
                               "optimizer": optimizer.state_dict()},
                              isBest=isbest,
                              ckpt_dir=model_dir)

        if isbest:
            # if this is the best accuracy so far, save the metrics to best_model_params.json
            logging.info("New best accuracy found!")
            best_val_acc = val_acc
            best_json_path = os.path.join(model_dir, "best_model_params.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        last_acc_path = os.path.join(model_dir, 'last_acc_metrics.json')
        utils.save_dict_to_json(val_metrics, last_acc_path)
Example #4
Source File: nasbench_weight_sharing_policy.py From eval-nas with MIT License
def run(self):
    """
    The difference from super().run() is that this version changes the eval pool by randomly
    sampling new architectures to replace the old ones.
    :return:
    """
    train_queue, valid_queue, test_queue, criterion = self.initialize_run()
    repeat_valid_queue = RepeatedDataLoader(valid_queue)
    args = self.args
    model, optimizer, scheduler = self.initialize_model()
    fitness_dict = {}
    self.optimizer = optimizer
    self.scheduler = scheduler
    logging.info(">> Begin the search with supernet method: {}".format(args.supernet_train_method))
    logging.info("Always setting BN-train to True!")
    for epoch in range(args.epochs):
        args.current_epoch = epoch
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # training for each epoch.
        train_acc, train_obj = self.train_fn(train_queue, valid_queue, model, criterion, optimizer, lr)
        self.logging_fn(train_acc, train_obj, epoch, 'Train', display_dict={'lr': lr})

        # step the scheduler after the optimizer (required after pytorch 1.1.0)
        scheduler.step()

        # validation; unlike the traditional setup, only the eval arch pool is evaluated in this step.
        validate_accuracies, valid_acc, valid_obj = self.child_valid(
            model, repeat_valid_queue, self.evaluate_model_spec_id_pool(), criterion)
        self.logging_fn(valid_acc, valid_obj, epoch, 'Valid')

        if not self.check_should_save(epoch):
            continue

        self.save_duplicate_arch_pool('valid', epoch)
        logging.info("Evaluating and saving the results.")
        utils.save_checkpoint(model, optimizer, self.running_stats, self.exp_dir)
        logging.info("Totally %d architectures now to evaluate", len(self.evaluate_model_spec_id_pool()))

        # evaluation steps.
        fitness_dict = self.evaluate(epoch, test_queue, fitnesses_dict=fitness_dict, train_queue=train_queue)

        # Generate new archs for robust evaluation: replace the bottom archs.
        num_new_archs = self.search_space.replace_eval_ids_by_random(args.controller_random_arch)
        logging.info("Generate %d new archs", num_new_archs)
        self.save_results(epoch, rank_details=True)

    # Process the ranking at the end and return the best architecture found during training.
    ep_k = [k for k in self.ranking_per_epoch.keys()][-1]
    best_id = self.ranking_per_epoch[ep_k][-1][1].geno_id
    return best_id, self.nasbench_model_specs[best_id]