Python tensorpack.utils.gpu.get_num_gpu() Examples
The following are 8 code examples of tensorpack.utils.gpu.get_num_gpu(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorpack.utils.gpu, or try the search function.
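Before the project examples, here is a minimal sketch of the call itself: get_num_gpu() reports how many GPUs are visible to tensorpack, and the examples below routinely clamp the result with max(..., 1) so that CPU-only runs still use one training tower. The print statement is only illustrative.

from tensorpack.utils.gpu import get_num_gpu

# Number of GPUs visible to tensorpack (0 on a CPU-only machine).
num_gpu = get_num_gpu()

# Common pattern from the examples below: fall back to a single tower without a GPU.
nr_tower = max(num_gpu, 1)
print("GPUs visible: {}, towers used: {}".format(num_gpu, nr_tower))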
Example #1
Source File: train.py From Regional-Homogeneity with MIT License | 5 votes

def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    assert FLAGS.batch % nr_tower == 0
    batch = FLAGS.batch // nr_tower
    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    data = QueueInput(get_dataflow(FLAGS.train_list_filename, batch))

    # learning rate
    START_LR = FLAGS.lr
    BASE_LR = START_LR * (FLAGS.batch / 256.0)
    lr_list = []
    for idx, decay_point in enumerate(FLAGS.lr_decay_points):
        lr_list.append((decay_point, BASE_LR * 0.1 ** idx))

    callbacks = [
        ScopeModelSaver(checkpoint_dir=FLAGS.RHP_savepath, scope='RHP'),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter('learning_rate', lr_list),
    ]
    if get_num_gpu() > 0:
        callbacks.append(GPUUtilizationTracker())

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=FLAGS.steps_per_epoch // FLAGS.batch,
        max_epoch=FLAGS.max_epoch,
        session_init=MultipleRestore()
    )
Example #2
Source File: vgg16.py From tensorpack with Apache License 2.0 | 5 votes

def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    assert total_batch >= 256  # otherwise the learning rate warmup is wrong.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate', [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)

    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
Example #3
Source File: alexnet.py From tensorpack with Apache License 2.0 | 5 votes

def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    if total_batch != 128:
        logger.warn("AlexNet needs to be trained with a total batch size of 128.")
    BASE_LR = 0.01 * (total_batch / 128.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    return TrainConfig(
        model=Model(),
        data=StagingInput(QueueInput(dataset_train)),
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    )
Example #4
Source File: train_tf.py From imgclsmob with MIT License | 4 votes

def train_net(net, session_init, batch_size, num_epochs, train_dataflow, val_dataflow):
    num_towers = max(get_num_gpu(), 1)
    batch_per_tower = batch_size // num_towers
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, batch_per_tower))

    num_training_samples = 1281167
    step_size = num_training_samples // batch_size
    max_iter = (num_epochs - 1) * step_size

    callbacks = [
        ModelSaver(),
        ScheduledHyperParamSetter(
            "learning_rate",
            [(0, 0.5), (max_iter, 0)],
            interp="linear",
            step_based=True),
        EstimatedTimeLeft()]

    infs = [ClassificationError("wrong-top1", "val-error-top1"),
            ClassificationError("wrong-top5", "val-error-top5")]
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(
            input=QueueInput(val_dataflow),
            infs=infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            input=val_dataflow,
            infs=infs,
            gpus=list(range(num_towers))))

    config = TrainConfig(
        dataflow=train_dataflow,
        model=net,
        callbacks=callbacks,
        session_init=session_init,
        steps_per_epoch=step_size,
        max_epoch=num_epochs)

    launch_train_with_config(
        config=config,
        trainer=SyncMultiGPUTrainerParameterServer(num_towers))
Example #5
Source File: imagenet-resnet-gn.py From GroupNorm-reproduce with Apache License 2.0 | 4 votes

def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter(
                'learning_rate',
                [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (5 * steps_per_epoch, BASE_LR)],
                    interp='linear', step_based=True))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    )
Example #6
Source File: adanet-resnet.py From adanet with MIT License | 4 votes

def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch], [batch, 224, 224, 3], [batch]],
            1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1),
                    (45, BASE_LR * 1e-2), (55, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=60,
    )
Example #7
Source File: imagenet-resnet.py From tensorpack with Apache License 2.0 | 4 votes

def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        if args.symbolic:
            data = TFDatasetInput(get_imagenet_tfdata(args.data, 'train', batch))
        else:
            data = QueueInput(get_imagenet_dataflow(args.data, 'train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                    (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_imagenet_dataflow(args.data, 'val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    if get_num_gpu() > 0:
        callbacks.append(GPUUtilizationTracker())

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    )
Example #8
Source File: train-atari.py From tensorpack with Apache License 2.0 | 4 votes

def train():
    # assign GPUs for training & inference
    num_gpu = get_num_gpu()
    global PREDICTOR_THREAD
    if num_gpu > 0:
        if num_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(num_gpu))[-num_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    config = TrainConfig(
        model=Model(),
        dataflow=master.get_training_dataflow(),
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            master,
            PeriodicTrigger(Evaluator(
                EVAL_EPISODE, ['state'], ['policy'], get_player),
                every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=SmartInit(args.load),
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if num_gpu == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)