Python mxnet.MXNetError() Examples
The following are 11 code examples of mxnet.MXNetError(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module mxnet, or try the search function.
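Every example below follows the same defensive pattern: wrap an MXNet call that can fail in the C++ backend (device allocation, invalid kvstore keys, unsupported dtypes) in try/except mx.MXNetError. A minimal sketch of the pattern, with the GPU id chosen purely for illustration:

import mxnet as mx

try:
    # Force synchronization so backend errors surface here rather than later:
    # MXNet executes asynchronously, and mx.MXNetError is only raised once the
    # result is actually needed.
    mx.nd.ones((2, 2), ctx=mx.gpu(0)).wait_to_read()
    print("gpu(0) is usable")
except mx.MXNetError as e:
    print("gpu(0) is not usable:", e)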
Example #1
Source File: dist_sync_kvstore.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes
def test_invalid_operations():
    def check_invalid_gluon_trainer_reset():
        params = mx.gluon.ParameterDict()
        x = params.get('x', shape=(4, 2), lr_mult=1.0, stype='row_sparse')
        params.initialize(ctx=mx.cpu(0), init='zeros')
        trainer = mx.gluon.Trainer(params, 'sgd', {'learning_rate': 0.1}, kvstore=kv)
        params.save('test_gluon_trainer_reset_' + str(my_rank) + '.params')
        row_id = mx.nd.arange(0, 4)
        w = x.row_sparse_data(row_id)
        assert trainer._kv_initialized and trainer._update_on_kvstore
        mx.nd.waitall()
        # load would fail to reset kvstore since update_on_kvstore is True
        assert_exception(params.load, RuntimeError,
                         'test_gluon_trainer_reset_' + str(my_rank) + '.params')
        print('worker ' + str(my_rank) + ' passed check_invalid_gluon_trainer_reset')

    def check_invalid_pull():
        kv.init(keys_invalid[0], mx.nd.ones((2, 2)).tostype('row_sparse'))
        out = mx.nd.ones((2, 2)).tostype('row_sparse')
        assert_exception(kv.pull, mx.MXNetError, 'invalid_key', out=out, ignore_sparse=False)
        print('worker ' + str(my_rank) + ' passed check_invalid_pull')

    check_invalid_gluon_trainer_reset()
    check_invalid_pull()
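The assert_exception helper comes from MXNet's distributed test harness, not from this snippet. A minimal stand-in, assuming it only checks that the call raises the given exception type:

def assert_exception(f, exception_type, *args, **kwargs):
    """Assert that f(*args, **kwargs) raises exception_type."""
    try:
        f(*args, **kwargs)
        assert False, "expected " + exception_type.__name__
    except exception_type:
        pass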
Example #2
Source File: test_metric_perf.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes
def run_metric(name, data_gen_cls, i, n, c, pred_ctx, label_ctx, **kwargs):
    """ Helper function for running one metric benchmark """
    metric = mx.metric.create(name, **kwargs)
    data_gen = data_gen_cls(n, c, pred_ctx, label_ctx)
    try:
        label, pred = data_gen.data()
        mx.nd.waitall()
        before = time.time()
        metric.update([label] * i, [pred] * i)
        mx.nd.waitall()
        elapsed = time.time() - before
        elapsed_str = "{:<.5}".format(elapsed)
    except mx.MXNetError:
        elapsed_str = "FAILED"
    print("{metric:<15}{pctx:<10}{lctx:<12}{niter:<12}{bs:<15}{out_dim:<15}{elapsed:<}".format(
        metric=name, pctx=str(pred_ctx), lctx=str(label_ctx), niter=i * n,
        bs=data_gen.batch_size, out_dim=data_gen.output_dim, elapsed=elapsed_str),
        file=sys.stderr)
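Note that the mx.nd.waitall() calls sit inside the try block on purpose: MXNet executes asynchronously, so a backend failure in metric.update() may only surface once synchronization is forced. Catching mx.MXNetError there lets a failing metric/context combination be reported as FAILED in the output table instead of aborting the whole benchmark run.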
Example #3
Source File: utils.py From sockeye with Apache License 2.0 | 6 votes
def get_gpu_memory_usage(ctx: Union[mx.context.Context, List[mx.context.Context]]) -> Dict[int, Tuple[int, int]]:
    """
    Returns used and total memory for GPUs identified by the given context list.

    :param ctx: List of MXNet context devices.
    :return: Dictionary of device id mapping to a tuple of (memory used, memory total).
    """
    if isinstance(ctx, mx.context.Context):
        ctx = [ctx]
    ctx = [c for c in ctx if c.device_type == 'gpu']
    if not ctx:
        return {}
    memory_data = {}  # type: Dict[int, Tuple[int, int]]
    for c in ctx:
        try:
            free, total = mx.context.gpu_memory_info(device_id=c.device_id)  # in bytes
            used = total - free
            memory_data[c.device_id] = (used * 1e-06, total * 1e-06)
        except mx.MXNetError:
            logger.exception("Failed retrieving memory data for gpu%d", c.device_id)
            continue
    log_gpu_memory_usage(memory_data)
    return memory_data
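A possible call site for this helper; the device ids are purely illustrative, and logger and log_gpu_memory_usage are assumed to be defined elsewhere in sockeye's utils module:

usage = get_gpu_memory_usage([mx.gpu(0), mx.gpu(1)])
for device_id, (used_mb, total_mb) in usage.items():
    # Despite the int annotation, the helper returns megabyte floats (bytes * 1e-06).
    print("gpu%d: %.0f/%.0f MB used" % (device_id, used_mb, total_mb))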
Example #4
Source File: utils.py From datawig with Apache License 2.0 | 6 votes
def get_context() -> mx.context:
    """
    Returns a list of all available gpu contexts for a given machine.
    If no gpus are available, returns [mx.cpu()].
    Use it to automatically return MxNet contexts (uses max number of gpus or cpu)

    :return: List of mxnet contexts of a gpu or [mx.cpu()] if gpu not available
    """
    context_list = []
    for gpu_number in range(16):
        try:
            _ = mx.nd.array([1, 2, 3], ctx=mx.gpu(gpu_number))
            context_list.append(mx.gpu(gpu_number))
        except mx.MXNetError:
            pass

    if len(context_list) == 0:
        context_list.append(mx.cpu())

    return context_list
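A minimal usage sketch, assuming nothing beyond the helper itself; get_context() always returns a non-empty list, so taking the first element is safe:

ctxs = get_context()
x = mx.nd.ones((2, 3), ctx=ctxs[0])  # first GPU if one exists, otherwise the CPU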
Example #5
Source File: test_metric_perf.py From SNIPER-mxnet with Apache License 2.0 | 6 votes
def run_metric(name, data_gen_cls, i, n, c, pred_ctx, label_ctx, **kwargs):
    """ Helper function for running one metric benchmark """
    metric = mx.metric.create(name, **kwargs)
    data_gen = data_gen_cls(n, c, pred_ctx, label_ctx)
    try:
        label, pred = data_gen.data()
        mx.nd.waitall()
        before = time.time()
        metric.update([label] * i, [pred] * i)
        mx.nd.waitall()
        elapsed = time.time() - before
        elapsed_str = "{:<.5}".format(elapsed)
    except mx.MXNetError:
        elapsed_str = "FAILED"
    print("{metric:<15}{pctx:<10}{lctx:<12}{niter:<12}{bs:<15}{out_dim:<15}{elapsed:<}".format(
        metric=name, pctx=str(pred_ctx), lctx=str(label_ctx), niter=i * n,
        bs=data_gen.batch_size, out_dim=data_gen.output_dim, elapsed=elapsed_str),
        file=sys.stderr)
Example #6
Source File: __init__.py From dgl with Apache License 2.0 | 5 votes
def is_cuda_available():
    # TODO: Does MXNet have a convenient function to test GPU availability/compilation?
    try:
        a = nd.array([1, 2, 3], ctx=mx.gpu())
        return True
    except mx.MXNetError:
        return False
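Here nd is mxnet.ndarray, imported elsewhere in dgl's module. One way the check might be used to pick a default device (illustrative, not dgl API):

ctx = mx.gpu() if is_cuda_available() else mx.cpu()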
Example #7
Source File: utils.py From sockeye with Apache License 2.0 | 5 votes
def get_num_gpus() -> int:
    """
    Gets the number of GPUs available on the host.

    :return: The number of GPUs on the system.
    """
    try:
        return mx.context.num_gpus()
    except mx.MXNetError:
        # Some builds of MXNet will raise a CUDA error when CUDA is not
        # installed on the host. In this case, zero GPUs are available.
        return 0
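Unlike the allocation probes of Examples #4 and #6, this helper asks the backend directly through mx.context.num_gpus(). A sketch of turning the count into a context list (the names are illustrative):

n = get_num_gpus()
ctxs = [mx.gpu(i) for i in range(n)] if n > 0 else [mx.cpu()]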
Example #8
Source File: model_handler.py From xfer with Apache License 2.0 | 5 votes
def get_module(self, iterator, fixed_layer_parameters=None, random_layer_parameters=None):
    """
    Return MXNet Module using the model symbol and parameters.

    :param iterator: MXNet iterator to be used with model.
    :type iterator: :class:`mxnet.io.DataIter`
    :param list(str) fixed_layer_parameters: List of layer parameters to keep fixed.
    :param list(str) random_layer_parameters: List of layer parameters to randomise.
    :return: MXNet module
    :rtype: :class:`mx.module.Module`
    """
    if fixed_layer_parameters is not None:
        fixed_layer_parameters = self._prune_parameters(fixed_layer_parameters)
    if random_layer_parameters is None:
        arg_params, aux_params = self.arg_params.copy(), self.aux_params.copy()
    else:
        arg_params, aux_params = self._remove_random_parameters(random_layer_parameters)
    mod = mx.mod.Module(symbol=self.symbol, context=self.devices,
                        fixed_param_names=fixed_layer_parameters,
                        label_names=(self.layer_names[-1] + "_label",),
                        data_names=(self.data_name,))
    mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label)
    mod.init_params(mx.init.Xavier(rnd_type='gaussian', factor_type='in', magnitude=2))
    try:
        mod.set_params(arg_params, aux_params, allow_missing=True, force_init=True)
    except mx.MXNetError as e:
        exceptions._handle_mxnet_error(e)
    return mod
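Here mx.MXNetError typically signals a shape or name mismatch between the supplied parameters and the bound symbol. Rather than swallowing it, xfer hands the error to its own exceptions._handle_mxnet_error helper, presumably to re-raise it with a more descriptive, library-specific message.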
Example #9
Source File: test_random.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 4 votes
def test_sample_multinomial():
    for dtype in ['uint8', 'int32', 'float16', 'float32', 'float64']:  # output array types
        for x in [mx.nd.array([[0,1,2,3,4],[4,3,2,1,0]])/10.0, mx.nd.array([0,1,2,3,4])/10.0]:
            dx = mx.nd.ones_like(x)
            mx.contrib.autograd.mark_variables([x], [dx])
            # Adding rtol and increasing samples needed to pass with seed 2951820647
            samples = 10000
            with mx.autograd.record():
                y, prob = mx.nd.random.multinomial(x, shape=samples, get_prob=True, dtype=dtype)
                r = prob * 5
                r.backward()
            assert(np.dtype(dtype) == y.dtype)
            y = y.asnumpy()
            x = x.asnumpy()
            dx = dx.asnumpy()
            if len(x.shape) == 1:
                x = x.reshape((1, x.shape[0]))
                dx = dx.reshape(1, dx.shape[0])
                y = y.reshape((1, y.shape[0]))
                prob = prob.reshape((1, prob.shape[0]))
            for i in range(x.shape[0]):
                freq = np.bincount(y[i,:].astype('int32'), minlength=5)/np.float32(samples)*x[i,:].sum()
                mx.test_utils.assert_almost_equal(freq, x[i], rtol=0.20, atol=1e-1)
                rprob = x[i][y[i].astype('int32')]/x[i].sum()
                mx.test_utils.assert_almost_equal(np.log(rprob), prob.asnumpy()[i], atol=1e-5)

                real_dx = np.zeros((5,))
                for j in range(samples):
                    real_dx[int(y[i][j])] += 5.0 / rprob[j]
                mx.test_utils.assert_almost_equal(real_dx, dx[i, :], rtol=1e-4, atol=1e-5)

    for dtype in ['uint8', 'float16', 'float32']:
        # Bound check for the output data types. 'int32' and 'float64' require large memory so are skipped.
        x = mx.nd.zeros(2 ** 25)  # Larger than the max integer in float32 without precision loss.
        bound_check = False
        try:
            y = mx.nd.random.multinomial(x, dtype=dtype)
        except mx.MXNetError as e:
            bound_check = True
        assert bound_check

# Test the generators with the chi-square testing
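The bound check at the end is a dtype-overflow test: with 2 ** 25 categories, sampled indices cannot be represented exactly as uint8 (max 255), float16 (integers exact only up to 2048), or float32 (exact only up to 2 ** 24), so mx.nd.random.multinomial is expected to reject the request with an MXNetError rather than silently return corrupted indices.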
Example #10
Source File: test_random.py From SNIPER-mxnet with Apache License 2.0 | 4 votes
def test_random_seed_setting_for_context():
    seed_to_test = 1234
    num_temp_seeds = 25
    probs = [0.125, 0.25, 0.25, 0.0625, 0.125, 0.1875]
    num_samples = 100000
    dev_type = mx.context.current_context().device_type
    for dtype in ['float16', 'float32', 'float64']:
        samples_imp = []
        samples_sym = []
        # Collect random number samples from the generators of all devices, each seeded with the same number.
        for dev_id in range(0, 16 if dev_type == 'gpu' else 1):
            # Currently python API does not provide a method to get the number of gpu devices.
            # Waiting for PR #10354, which provides the method, to be merged.
            # As a temporary workaround, try first and catch the exception caused by the absence of the device with `dev_id`.
            try:
                with mx.Context(dev_type, dev_id):
                    ctx = mx.context.current_context()
                    seed = set_seed_variously_for_context(ctx, 1, num_temp_seeds, seed_to_test)

                    # Check imperative. `multinomial` uses non-parallel rng.
                    rnds = mx.nd.random.multinomial(data=mx.nd.array(probs, dtype=dtype), shape=num_samples)
                    samples_imp.append(rnds.asnumpy())

                    # Check symbolic. `multinomial` uses non-parallel rng.
                    P = mx.sym.Variable("P")
                    X = mx.sym.random.multinomial(data=P, shape=num_samples, get_prob=False)
                    exe = X.bind(ctx, {"P": mx.nd.array(probs, dtype=dtype)})
                    set_seed_variously_for_context(ctx, seed, num_temp_seeds, seed_to_test)
                    exe.forward()
                    samples_sym.append(exe.outputs[0].asnumpy())
            except mx.MXNetError as e:
                if str(e).find("invalid device ordinal") != -1:
                    break
                else:
                    raise e
        # The samples should be identical across different gpu devices.
        for i in range(1, len(samples_imp)):
            assert same(samples_imp[i - 1], samples_imp[i])
        for i in range(1, len(samples_sym)):
            assert same(samples_sym[i - 1], samples_sym[i])

# Tests that seed setting of parallel rng for specific context is synchronous w.r.t. rng use before and after.
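The probe-until-failure idiom used here (stopping at the first "invalid device ordinal" error) can be distilled into a small helper. This is a sketch of the idea only, not an MXNet API:

def visible_gpus(max_devices=16):
    """Count GPUs by probing device ids until allocation fails."""
    count = 0
    for dev_id in range(max_devices):
        try:
            # wait_to_read() forces the asynchronous allocation to complete,
            # so a missing device raises mx.MXNetError here.
            mx.nd.zeros((1,), ctx=mx.gpu(dev_id)).wait_to_read()
            count += 1
        except mx.MXNetError:
            break
    return count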
Example #11
Source File: test_random.py From SNIPER-mxnet with Apache License 2.0 | 4 votes
def test_parallel_random_seed_setting_for_context():
    seed_to_test = 1234
    dev_type = mx.context.current_context().device_type
    for dtype in ['float16', 'float32', 'float64']:
        samples_imp = []
        samples_sym = []
        # Collect random number samples from the generators of all devices, each seeded with the same number.
        for dev_id in range(0, 16 if dev_type == 'gpu' else 1):
            # Currently python API does not provide a method to get the number of gpu devices.
            # Waiting for PR #10354, which provides the method, to be merged.
            # As a temporary workaround, try first and catch the exception caused by the absence of the device with `dev_id`.
            try:
                with mx.Context(dev_type, dev_id):
                    ctx = mx.context.current_context()
                    # Avoid excessive test cpu runtimes.
                    num_temp_seeds = 25 if dev_type == 'gpu' else 1
                    # To flush out a possible race condition, run multiple times.
                    for _ in range(20):
                        # Create enough samples such that we get a meaningful distribution.
                        shape = (200, 200)
                        params = {'low': -1.5, 'high': 3.0}
                        params.update(shape=shape, dtype=dtype)

                        # Check imperative. `uniform` uses parallel rng.
                        seed = set_seed_variously_for_context(ctx, 1, num_temp_seeds, seed_to_test)
                        rnds = mx.nd.random.uniform(**params)
                        samples_imp.append(rnds.asnumpy())

                        # Check symbolic. `uniform` uses parallel rng.
                        X = mx.sym.Variable("X")
                        Y = mx.sym.random.uniform(**params) + X
                        x = mx.nd.zeros(shape, dtype=dtype)
                        xgrad = mx.nd.zeros(shape, dtype=dtype)
                        yexec = Y.bind(ctx, {'X': x}, {'X': xgrad})
                        set_seed_variously_for_context(ctx, seed, num_temp_seeds, seed_to_test)
                        yexec.forward(is_train=True)
                        yexec.backward(yexec.outputs[0])
                        samples_sym.append(yexec.outputs[0].asnumpy())
            except mx.MXNetError as e:
                if str(e).find("invalid device ordinal") != -1:
                    break
                else:
                    raise e
        # The samples should be identical across different gpu devices.
        for i in range(1, len(samples_imp)):
            assert same(samples_imp[i - 1], samples_imp[i])
        for i in range(1, len(samples_sym)):
            assert same(samples_sym[i - 1], samples_sym[i])