Python cupy.RawKernel() Examples

The following are 8 code examples of cupy.RawKernel(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module cupy, or try the search function.
Example #1
Source File: non_maximum_suppression.py    From chainer-compiler with MIT License 6 votes vote down vote up
def _call_nms_kernel(bbox, thresh):
    """Launch the ``nms_kernel`` CUDA kernel and post-process its mask.

    This entry point is deliberately disabled: the leading assertion fails
    unconditionally, so nothing below it runs unless assertions are
    stripped (``python -O``).

    Args:
        bbox: Bounding boxes. Only ``bbox.shape[0]`` (the box count) is read
            here before the array is handed to the kernel as contiguous
            float32 data.
            NOTE(review): presumably shaped ``(n_bbox, 4)`` -- confirm
            against the caller.
        thresh: Threshold forwarded to the kernel as a float32 scalar.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    assert False, "Not supported."

    box_count = bbox.shape[0]
    block_size = 64
    # Number of block_size-wide column groups needed to cover every box.
    n_col_blocks = np.ceil(box_count / block_size).astype(np.int32)
    grid_dim = (n_col_blocks, n_col_blocks, 1)
    block_dim = (block_size, 1, 1)

    # One uint64 word per (box, column block) pair, zero-initialised.
    device_mask = cp.zeros((box_count * n_col_blocks,), dtype=np.uint64)
    device_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    nms_kernel = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    kernel_args = (cp.int32(box_count), cp.float32(thresh),
                   device_boxes, device_mask)
    nms_kernel(grid_dim, block_dim, args=kernel_args)

    # Copy the mask back to the host; the helper turns it into indices.
    host_mask = device_mask.get()
    selection, n_selec = _nms_gpu_post(
        host_mask, box_count, block_size, n_col_blocks)
    return selection, n_selec
Example #2
Source File: cuda.py    From chainer with MIT License 6 votes vote down vote up
def raw(code, name, *args, **kwargs):
    """Builds a :class:`cupy.RawKernel` after verifying CUDA is available.

    Every argument is forwarded unchanged to :class:`cupy.RawKernel`, so the
    accepted arguments are exactly those of that class.

    NOTE(review): the original documentation states that the resulting
    kernel object is cached for each argument combination and CUDA device
    via :func:`~chainer.backends.cuda.memoize`.  No caching is visible in
    this excerpt -- presumably it is applied by a decorator outside this
    view; confirm before relying on it.

    """
    check_cuda_available()
    kernel = cupy.RawKernel(code, name, *args, **kwargs)
    return kernel


# ------------------------------------------------------------------------------
# numpy/cupy compatible coding
# ------------------------------------------------------------------------------ 
Example #3
Source File: test_raw.py    From cupy with MIT License 6 votes vote down vote up
def setUp(self):
        """Create a fresh compile cache and the kernel/modules under test."""
        global _test_cache_dir

        current_device = cupy.cuda.runtime.getDevice()
        self.dev = current_device
        # NOTE(review): device 1 is presumably reserved as the "other"
        # device by tests in this class -- confirm.
        assert current_device != 1

        # Point CUPY_CACHE_DIR at a brand-new temporary directory and keep
        # its path in the module-level global.
        cache_dir = tempfile.mkdtemp()
        _test_cache_dir = cache_dir
        os.environ['CUPY_CACHE_DIR'] = cache_dir

        backend = self.backend
        self.kern = cupy.RawKernel(_test_source1, 'test_sum', backend=backend)
        self.mod2 = cupy.RawModule(code=_test_source2, backend=backend)
        self.mod3 = cupy.RawModule(
            code=_test_source3,
            options=('-DPRECISION=2',),
            backend=backend,
        )
Example #4
Source File: non_maximum_suppression.py    From chainercv with MIT License 6 votes vote down vote up
def _call_nms_kernel(bbox, thresh):
    """Launch the ``nms_kernel`` CUDA kernel and post-process its mask.

    Args:
        bbox: Bounding boxes. Only ``bbox.shape[0]`` (the box count) is read
            here before the array is handed to the kernel as contiguous
            float32 data.
            NOTE(review): presumably shaped ``(n_bbox, 4)`` -- confirm
            against the caller.
        thresh: Threshold forwarded to the kernel as a float32 scalar.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    box_count = bbox.shape[0]
    block_size = 64
    # Number of block_size-wide column groups needed to cover every box.
    n_col_blocks = np.ceil(box_count / block_size).astype(np.int32)
    grid_dim = (n_col_blocks, n_col_blocks, 1)
    block_dim = (block_size, 1, 1)

    # One uint64 word per (box, column block) pair, zero-initialised.
    device_mask = cp.zeros((box_count * n_col_blocks,), dtype=np.uint64)
    device_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    nms_kernel = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    kernel_args = (cp.int32(box_count), cp.float32(thresh),
                   device_boxes, device_mask)
    nms_kernel(grid_dim, block_dim, args=kernel_args)

    # Copy the mask back to the host; the helper turns it into indices.
    host_mask = device_mask.get()
    selection, n_selec = _nms_gpu_post(
        host_mask, box_count, block_size, n_col_blocks)
    return selection, n_selec
Example #5
Source File: sgemm.py    From cupy with MIT License 5 votes vote down vote up
def sgemm(A, B,
          dim_x=16, dim_y=16, blk_m=64, blk_n=64, blk_k=4,
          dim_xa=64, dim_ya=4, dim_xb=4, dim_yb=64):
    """Multiply two float32 matrices with the custom ``sgemm`` raw kernel.

    Args:
        A: Left operand, a 2-D float32 array of shape ``(m, k)``.
        B: Right operand, a 2-D float32 array of shape ``(k, n)``.
        dim_x, dim_y: Thread-block dimensions used for the launch.
        blk_m, blk_n, blk_k: Tile sizes; ``blk_m`` and ``blk_n`` also
            determine the launch grid.
        dim_xa, dim_ya, dim_xb, dim_yb: Additional kernel parameters whose
            products must equal ``dim_x * dim_y``.

    All tuning parameters are handed to ``read_code`` as ``params``.
    NOTE(review): presumably they are substituted into the kernel source as
    compile-time constants -- confirm against ``read_code``.

    Returns:
        A new Fortran-ordered float32 array of shape ``(m, n)``.

    Raises:
        AssertionError: If an operand is not float32 or the thread-layout
            products disagree.
        ValueError: If the inner dimensions of ``A`` and ``B`` differ.
    """
    assert A.dtype == cp.float32
    assert B.dtype == cp.float32
    assert(dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb)

    m, k = A.shape
    k_b, n = B.shape
    # The kernel receives a single ``k``.  Letting B's value silently
    # replace A's would launch the kernel with a depth that does not match
    # A's buffer, so reject mismatched operands up front.
    if k != k_b:
        raise ValueError(
            'inner dimensions do not match: A has shape {}, B has shape {}'
            .format(A.shape, B.shape))

    # Inputs matrices need to be in Fortran order.
    A = cp.asfortranarray(A)
    B = cp.asfortranarray(B)

    C = cp.empty((m, n), dtype=cp.float32, order='F')

    config = {'DIM_X': dim_x, 'DIM_Y': dim_y,
              'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k,
              'DIM_XA': dim_xa, 'DIM_YA': dim_ya,
              'DIM_XB': dim_xb, 'DIM_YB': dim_yb,
              'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y}
    code = read_code(sgemm_file, params=config)
    kern = cp.RawKernel(code, 'sgemm')

    # One block per (blk_m x blk_n) tile of C, rounding up at the edges.
    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    args = (m, n, k, A, B, C)
    # Dynamic shared memory size.
    # NOTE(review): the factor 4 is presumably sizeof(float) and the "+ 1"
    # terms padding -- confirm against the kernel source.
    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
    kern(grid, block, args=args, shared_mem=shared_mem)
    return C
Example #6
Source File: test_raw.py    From cupy with MIT License 5 votes vote down vote up
def test_dynamical_parallelism(self):
        """Launch ``test_kernel`` built with ``-dc`` and expect all ones."""
        kernel = cupy.RawKernel(
            _test_source4, 'test_kernel',
            options=('-dc',),
            backend=self.backend,
        )
        total = 169
        chunk = 13
        out = cupy.zeros((total,), dtype=cupy.float32)

        # A single block of total // chunk threads.
        grid = (1,)
        block = (total // chunk,)
        kernel(grid, block, (out, total, chunk))

        # Every element must have been set to 1.0 by the kernel.
        all_ones = (out == 1.0).all()
        assert all_ones
Example #7
Source File: test_raw.py    From cupy with MIT License 5 votes vote down vote up
def test_dynamical_parallelism_compile_failure(self):
        """Launching without the separate-compilation option must fail."""
        # The option for separate compilation is omitted on purpose, so the
        # first launch is expected to raise.
        kernel = cupy.RawKernel(
            _test_source4, 'test_kernel',
            backend=self.backend,
        )
        total = 10
        chunk = 2
        out = cupy.zeros((total,), dtype=cupy.float32)

        # nvrtc: raised when calling ls.complete(); otherwise (nvcc) the
        # compiler itself reports the failure.
        expected_error = (
            cupy.cuda.driver.CUDADriverError
            if self.backend == 'nvrtc'
            else cupy.cuda.compiler.CompileException
        )
        with pytest.raises(expected_error):
            kernel((1,), (total // chunk,), (out, total, chunk))
Example #8
Source File: test_raw.py    From cupy with MIT License 5 votes vote down vote up
def test_context_switch_RawKernel(self):
        """Run the basic kernel check again while device 1 is current."""
        # A RawKernel has to be launched once to force compiling, so do a
        # first launch here; its results are not inspected.
        _lhs, _rhs, _out = self._helper(self.kern, cupy.float32)

        # Same check as test_basic(), but on another device.
        with cupy.cuda.Device(1):
            lhs, rhs, out = self._helper(self.kern, cupy.float32)
            expected = lhs + rhs
            assert cupy.allclose(out, expected)