Python tensorpack.utils.gpu.get_nr_gpu() Examples

The following are 6 code examples of tensorpack.utils.gpu.get_nr_gpu(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module tensorpack.utils.gpu , or try the search function .
Example #1
Source File: ssh.py    From tensorflow-recipes with Apache License 2.0 6 votes vote down vote up
def get_config():
    global BATCH
    nr_tower = max(get_nr_gpu(), 1)
    BATCH = TOTAL_BATCH_SIZE // nr_tower
    logger.set_logger_dir()

    ds_train = get_data('train')
    ds_test = get_data('test')

    return TrainConfig(
        model=Model(),
        data=QueueInput(ds_train),
        callbacks=[
            ModelSaver(),
            InferenceRunner(ds_test, [ScalarStats('total_costs')]),
        ],
        extra_callbacks=[
            MovingAverageSummary(),
            ProgressBar(['']),
            MergeAllSummaries(),
            RunUpdateOps()
        ],
        steps_per_epoch=ds_train.size(),
        max_epoch=100,
    ) 
Example #2
Source File: train.py    From ADL with MIT License 6 votes vote down vote up
def main():
    args = get_args()
    nr_gpu = get_nr_gpu()
    args.batch_size = args.batch_size // nr_gpu

    model = Model(args)

    if args.evaluate:
        evaluate_wsol(args, model, interval=False)
        sys.exit()

    logger.set_logger_dir(ospj('train_log', args.log_dir))
    config = get_config(model, args)

    if args.use_pretrained_model:
        config.session_init = get_model_loader(_CKPT_NAMES[args.arch_name])

    launch_train_with_config(config,
                             SyncMultiGPUTrainerParameterServer(nr_gpu))

    evaluate_wsol(args, model, interval=True) 
Example #3
Source File: imagenet.py    From LQ-Nets with MIT License 5 votes vote down vote up
def get_config(model, fake=False, data_aug=True):
    nr_tower = max(get_nr_gpu(), 1)
    batch = TOTAL_BATCH_SIZE // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch, data_aug)
        dataset_val = get_data('val', batch, data_aug)
        callbacks = [
            ModelSaver(),
        ]
        if data_aug:
            callbacks.append(ScheduledHyperParamSetter('learning_rate',
                                                       [(30, 1e-2), (60, 1e-3), (85, 1e-4), (95, 1e-5), (105, 1e-6)]))
        callbacks.append(HumanHyperParamSetter('learning_rate'))
        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=5000 if TOTAL_BATCH_SIZE == 256 else 10000,
        max_epoch=110 if data_aug else 64,
        nr_tower=nr_tower
    ) 
Example #4
Source File: train.py    From ADL with MIT License 5 votes vote down vote up
def get_steps_per_epoch(option):
    nr_gpu = get_nr_gpu()
    total_batch = option.batch_size * nr_gpu

    if option.dataset_name == 'CUB':
        steps_per_epoch = 25 * (256 / total_batch) * option.stepscale
    elif option.dataset_name == 'ILSVRC':
        steps_per_epoch = 5000 * (256 / total_batch) * option.stepscale
    else:
        raise KeyError("Unavailable dataset: {}".format(option.dataset_name))
    return int(steps_per_epoch) 
Example #5
Source File: train.py    From PReMVOS with MIT License 5 votes vote down vote up
def get_batch_factor():
    nr_gpu = get_nr_gpu()
    assert nr_gpu in [1, 2, 4, 8], nr_gpu
    return 8 // nr_gpu 
Example #6
Source File: imagenet-resnet.py    From webvision-2.0-benchmarks with Apache License 2.0 4 votes vote down vote up
def get_config(model, fake=False):
    nr_tower = max(get_nr_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
        dataset_train = get_data('train', batch)
        dataset_val = get_data('val', batch)

        BASE_LR = 0.1 * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (3, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=110,
    )