Python tensorpack.utils.gpu.get_num_gpu() Examples

The following are 8 code examples of tensorpack.utils.gpu.get_num_gpu(), collected from open-source projects. The source file, project, and license are listed above each example.
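Before the project examples, here is a minimal sketch of the idiom they all share: get_num_gpu() reports how many GPUs are visible to the process, and max(..., 1) keeps at least one training tower so the same code also runs without a GPU (the print statement below is illustrative only).

from tensorpack.utils.gpu import get_num_gpu

num_gpu = get_num_gpu()      # number of visible GPUs (0 on a CPU-only machine)
nr_tower = max(num_gpu, 1)   # always use at least one training tower
print("Found {} GPU(s), training on {} tower(s)".format(num_gpu, nr_tower))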
Example #1
Source File: train.py    From Regional-Homogeneity with MIT License
def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    assert FLAGS.batch % nr_tower == 0
    batch = FLAGS.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

    data = QueueInput(get_dataflow(FLAGS.train_list_filename, batch))

    # learning rate
    START_LR = FLAGS.lr
    BASE_LR = START_LR * (FLAGS.batch / 256.0)
    lr_list = []
    for idx, decay_point in enumerate(FLAGS.lr_decay_points):
        lr_list.append((decay_point, BASE_LR * 0.1 ** idx))
    callbacks = [
        ScopeModelSaver(checkpoint_dir=FLAGS.RHP_savepath, scope='RHP'),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter('learning_rate', lr_list),
    ]

    if get_num_gpu() > 0:
        callbacks.append(GPUUtilizationTracker())

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=FLAGS.steps_per_epoch // FLAGS.batch,
        max_epoch=FLAGS.max_epoch,
        session_init=MultipleRestore()
    ) 
Example #2
Source File: vgg16.py    From tensorpack with Apache License 2.0
def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    assert total_batch >= 256   # otherwise the learning rate warmup is wrong.
    BASE_LR = 0.01 * (total_batch / 256.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
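        # warm up the learning rate linearly over the first 3 epochs, then apply step decay at epochs 30/60/80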
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, 0.01), (3, max(BASE_LR, 0.01))], interp='linear'),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    input = QueueInput(dataset_train)
    input = StagingInput(input, nr_stage=1)
    return TrainConfig(
        model=Model(),
        data=input,
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    ) 
Example #3
Source File: alexnet.py    From tensorpack with Apache License 2.0
def get_config():
    nr_tower = max(get_num_gpu(), 1)
    batch = args.batch
    total_batch = batch * nr_tower
    if total_batch != 128:
        logger.warn("AlexNet needs to be trained with a total batch size of 128.")
    BASE_LR = 0.01 * (total_batch / 128.)

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    dataset_train = get_data('train', batch)
    dataset_val = get_data('val', batch)

    infs = [ClassificationError('wrong-top1', 'val-error-top1'),
            ClassificationError('wrong-top5', 'val-error-top5')]
    callbacks = [
        ModelSaver(),
        GPUUtilizationTracker(),
        EstimatedTimeLeft(),
        ScheduledHyperParamSetter(
            'learning_rate',
            [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2), (80, BASE_LR * 1e-3)]),
        DataParallelInferenceRunner(
            dataset_val, infs, list(range(nr_tower))),
    ]

    return TrainConfig(
        model=Model(),
        data=StagingInput(QueueInput(dataset_train)),
        callbacks=callbacks,
        steps_per_epoch=1281167 // total_batch,
        max_epoch=100,
    ) 
Example #4
Source File: train_tf.py    From imgclsmob with MIT License
def train_net(net,
              session_init,
              batch_size,
              num_epochs,
              train_dataflow,
              val_dataflow):

    num_towers = max(get_num_gpu(), 1)
    batch_per_tower = batch_size // num_towers
    logger.info("Running on {} towers. Batch size per tower: {}".format(num_towers, batch_per_tower))

    num_training_samples = 1281167
    step_size = num_training_samples // batch_size
    max_iter = (num_epochs - 1) * step_size
    callbacks = [
        ModelSaver(),
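        # anneal the learning rate linearly from 0.5 to 0 over the run (step-based schedule)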
        ScheduledHyperParamSetter(
            "learning_rate",
            [(0, 0.5), (max_iter, 0)],
            interp="linear",
            step_based=True),
        EstimatedTimeLeft()]

    infs = [ClassificationError("wrong-top1", "val-error-top1"),
            ClassificationError("wrong-top5", "val-error-top5")]
    if num_towers == 1:
        # single-GPU inference with queue prefetch
        callbacks.append(InferenceRunner(
            input=QueueInput(val_dataflow),
            infs=infs))
    else:
        # multi-GPU inference (with mandatory queue prefetch)
        callbacks.append(DataParallelInferenceRunner(
            input=val_dataflow,
            infs=infs,
            gpus=list(range(num_towers))))

    config = TrainConfig(
        dataflow=train_dataflow,
        model=net,
        callbacks=callbacks,
        session_init=session_init,
        steps_per_epoch=step_size,
        max_epoch=num_epochs)

    launch_train_with_config(
        config=config,
        trainer=SyncMultiGPUTrainerParameterServer(num_towers)) 
Example #5
Source File: imagenet-resnet-gn.py    From GroupNorm-reproduce with Apache License 2.0
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    if fake:
        logger.info("For benchmark, batch size is fixed to 64 per tower.")
        dataset_train = FakeData(
            [[64, 224, 224, 3], [64]], 1000, random=False, dtype='uint8')
        callbacks = []
        steps_per_epoch = 100
    else:
        logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))

        dataset_train = get_imagenet_dataflow(args.data, 'train', batch)
        dataset_val = get_imagenet_dataflow(args.data, 'val', min(64, batch))
        steps_per_epoch = 1281167 // args.batch

        BASE_LR = 0.1 * args.batch / 256.0
        logger.info("BASELR: {}".format(BASE_LR))
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            GPUUtilizationTracker(),
            ScheduledHyperParamSetter(
                'learning_rate', [(0, BASE_LR), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                                  (90, BASE_LR * 1e-3)]),
        ]
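        # when the scaled base LR exceeds the default 0.1, add a linear warmup over the first 5 epochs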
        if BASE_LR > 0.1:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, 0.1), (5 * steps_per_epoch, BASE_LR)],
                    interp='linear', step_based=True))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return TrainConfig(
        model=model,
        dataflow=dataset_train,
        callbacks=callbacks,
        steps_per_epoch=steps_per_epoch,
        max_epoch=100,
    ) 
Example #6
Source File: adanet-resnet.py    From adanet with MIT License
def get_config(model, fake=False):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch], [batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        data = QueueInput(get_data('train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (45, BASE_LR * 1e-2),
                    (55, BASE_LR * 1e-3)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_data('val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    return AutoResumeTrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1280000 // args.batch,
        max_epoch=60,
    ) 
Example #7
Source File: imagenet-resnet.py    From tensorpack with Apache License 2.0
def get_config(model):
    nr_tower = max(get_num_gpu(), 1)
    assert args.batch % nr_tower == 0
    batch = args.batch // nr_tower

    logger.info("Running on {} towers. Batch size per tower: {}".format(nr_tower, batch))
    if batch < 32 or batch > 64:
        logger.warn("Batch size per tower not in [32, 64]. This probably will lead to worse accuracy than reported.")
    if args.fake:
        data = QueueInput(FakeData(
            [[batch, 224, 224, 3], [batch]], 1000, random=False, dtype='uint8'))
        callbacks = []
    else:
        if args.symbolic:
            data = TFDatasetInput(get_imagenet_tfdata(args.data, 'train', batch))
        else:
            data = QueueInput(get_imagenet_dataflow(args.data, 'train', batch))

        START_LR = 0.1
        BASE_LR = START_LR * (args.batch / 256.0)
        callbacks = [
            ModelSaver(),
            EstimatedTimeLeft(),
            ScheduledHyperParamSetter(
                'learning_rate', [
                    (0, min(START_LR, BASE_LR)), (30, BASE_LR * 1e-1), (60, BASE_LR * 1e-2),
                    (90, BASE_LR * 1e-3), (100, BASE_LR * 1e-4)]),
        ]
        if BASE_LR > START_LR:
            callbacks.append(
                ScheduledHyperParamSetter(
                    'learning_rate', [(0, START_LR), (5, BASE_LR)], interp='linear'))

        infs = [ClassificationError('wrong-top1', 'val-error-top1'),
                ClassificationError('wrong-top5', 'val-error-top5')]
        dataset_val = get_imagenet_dataflow(args.data, 'val', batch)
        if nr_tower == 1:
            # single-GPU inference with queue prefetch
            callbacks.append(InferenceRunner(QueueInput(dataset_val), infs))
        else:
            # multi-GPU inference (with mandatory queue prefetch)
            callbacks.append(DataParallelInferenceRunner(
                dataset_val, infs, list(range(nr_tower))))

    if get_num_gpu() > 0:
        callbacks.append(GPUUtilizationTracker())

    return TrainConfig(
        model=model,
        data=data,
        callbacks=callbacks,
        steps_per_epoch=100 if args.fake else 1281167 // args.batch,
        max_epoch=105,
    ) 
Example #8
Source File: train-atari.py    From tensorpack with Apache License 2.0
def train():
    # assign GPUs for training & inference
    num_gpu = get_num_gpu()
    global PREDICTOR_THREAD
    if num_gpu > 0:
        if num_gpu > 1:
            # use half gpus for inference
            predict_tower = list(range(num_gpu))[-num_gpu // 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        train_tower = list(range(num_gpu))[:-num_gpu // 2] or [0]
        logger.info("[Batch-A3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, train_tower)), ','.join(map(str, predict_tower))))
    else:
        logger.warn("Without GPU this model will never learn! CPU is only useful for debug.")
        PREDICTOR_THREAD = 1
        predict_tower, train_tower = [0], [0]

    # setup simulator processes
    name_base = str(uuid.uuid1())[:6]
    prefix = '@' if sys.platform.startswith('linux') else ''
    namec2s = 'ipc://{}sim-c2s-{}'.format(prefix, name_base)
    names2c = 'ipc://{}sim-s2c-{}'.format(prefix, name_base)
    procs = [MySimulatorWorker(k, namec2s, names2c) for k in range(SIMULATOR_PROC)]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    master = MySimulatorMaster(namec2s, names2c, predict_tower)
    config = TrainConfig(
        model=Model(),
        dataflow=master.get_training_dataflow(),
        callbacks=[
            ModelSaver(),
            ScheduledHyperParamSetter('learning_rate', [(20, 0.0003), (120, 0.0001)]),
            ScheduledHyperParamSetter('entropy_beta', [(80, 0.005)]),
            master,
            PeriodicTrigger(Evaluator(
                EVAL_EPISODE, ['state'], ['policy'], get_player),
                every_k_epochs=3),
        ],
        session_creator=sesscreate.NewSessionCreator(config=get_default_sess_config(0.5)),
        steps_per_epoch=STEPS_PER_EPOCH,
        session_init=SmartInit(args.load),
        max_epoch=1000,
    )
    trainer = SimpleTrainer() if num_gpu == 1 else AsyncMultiGPUTrainer(train_tower)
    launch_train_with_config(config, trainer)