Python cupy.RawKernel() Examples
The following are 8 code examples of cupy.RawKernel().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module cupy, or try the search function.
Example #1
Source File: non_maximum_suppression.py From chainer-compiler with MIT License | 6 votes |
def _call_nms_kernel(bbox, thresh): assert False, "Not supported." n_bbox = bbox.shape[0] threads_per_block = 64 col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) blocks = (col_blocks, col_blocks, 1) threads = (threads_per_block, 1, 1) mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) bbox = cp.ascontiguousarray(bbox, dtype=np.float32) kern = cp.RawKernel(_nms_gpu_code, 'nms_kernel') kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), bbox, mask_dev)) mask_host = mask_dev.get() selection, n_selec = _nms_gpu_post( mask_host, n_bbox, threads_per_block, col_blocks) return selection, n_selec
Example #2
Source File: cuda.py From chainer with MIT License | 6 votes |
def raw(code, name, *args, **kwargs):
    """Build a :class:`cupy.RawKernel` after checking that CUDA is usable.

    ``check_cuda_available()`` is called first; every argument is then
    forwarded unchanged to :class:`cupy.RawKernel`, whose signature this
    function shares.

    NOTE(review): the upstream description states that the resulting kernel
    object is cached per argument combination and CUDA device through
    :func:`~chainer.backends.cuda.memoize`.  No caching decorator is visible
    on this snippet -- confirm against the original module.
    """
    check_cuda_available()
    kernel = cupy.RawKernel(code, name, *args, **kwargs)
    return kernel


# ------------------------------------------------------------------------------
# numpy/cupy compatible coding
# ------------------------------------------------------------------------------
Example #3
Source File: test_raw.py From cupy with MIT License | 6 votes |
def setUp(self):
    """Record the current device, isolate the CuPy cache and build kernels.

    A fresh temporary directory is created, stored in the module-level
    ``_test_cache_dir`` and exported through ``CUPY_CACHE_DIR``.  One
    ``RawKernel`` and two ``RawModule`` objects are then created with the
    backend under test.
    """
    global _test_cache_dir

    current_device = cupy.cuda.runtime.getDevice()
    self.dev = current_device
    # presumably device 1 is reserved as the "other" device by the
    # multi-device tests -- confirm
    assert self.dev != 1

    cache_dir = tempfile.mkdtemp()
    _test_cache_dir = cache_dir
    os.environ['CUPY_CACHE_DIR'] = cache_dir

    backend = self.backend
    self.kern = cupy.RawKernel(_test_source1, 'test_sum', backend=backend)
    self.mod2 = cupy.RawModule(code=_test_source2, backend=backend)
    self.mod3 = cupy.RawModule(
        code=_test_source3,
        options=('-DPRECISION=2',),
        backend=backend,
    )
Example #4
Source File: non_maximum_suppression.py From chainercv with MIT License | 6 votes |
def _call_nms_kernel(bbox, thresh):
    """Launch ``nms_kernel`` on the GPU and post-process its mask on the host.

    Args:
        bbox: Array of boxes; ``bbox.shape[0]`` is the number of boxes.  It
            is converted to a contiguous ``float32`` CuPy array before the
            launch.
        thresh: Threshold handed to the kernel as a ``float32`` scalar.

    Returns:
        The ``(selection, n_selec)`` pair produced by ``_nms_gpu_post``.
    """
    block_width = 64
    box_count = bbox.shape[0]
    n_col_blocks = np.ceil(box_count / block_width).astype(np.int32)

    grid = (n_col_blocks, n_col_blocks, 1)
    block = (block_width, 1, 1)

    # One uint64 word per (box, column block) pair, zero-initialised.
    device_mask = cp.zeros((box_count * n_col_blocks,), dtype=np.uint64)
    device_boxes = cp.ascontiguousarray(bbox, dtype=np.float32)

    nms_kernel = cp.RawKernel(_nms_gpu_code, 'nms_kernel')
    kernel_args = (
        cp.int32(box_count),
        cp.float32(thresh),
        device_boxes,
        device_mask,
    )
    nms_kernel(grid, block, args=kernel_args)

    # Copy the mask back to the host for the post-processing step.
    host_mask = device_mask.get()
    return _nms_gpu_post(host_mask, box_count, block_width, n_col_blocks)
Example #5
Source File: sgemm.py From cupy with MIT License | 5 votes |
def sgemm(A, B,
          dim_x=16, dim_y=16, blk_m=64, blk_n=64, blk_k=4,
          dim_xa=64, dim_ya=4, dim_xb=4, dim_yb=64):
    """Run the ``sgemm`` raw kernel on two ``float32`` matrices.

    Args:
        A: 2-D ``float32`` array of shape ``(m, k)``.
        B: 2-D ``float32`` array of shape ``(k, n)``.
        dim_x, dim_y: Thread-block dimensions used for the launch.
        blk_m, blk_n, blk_k: Tile sizes; they determine the grid size and
            the amount of dynamic shared memory requested.
        dim_xa, dim_ya, dim_xb, dim_yb: Additional layout parameters passed
            to the kernel source.  Their products must equal
            ``dim_x * dim_y``.

    Returns:
        A Fortran-ordered ``float32`` array of shape ``(m, n)`` filled by
        the kernel (presumably the product of ``A`` and ``B``; the kernel
        source is not part of this snippet).

    Raises:
        AssertionError: If a dtype is not ``float32``, the thread-layout
            products disagree, or the inner dimensions of ``A`` and ``B``
            differ.
    """
    assert A.dtype == cp.float32
    assert B.dtype == cp.float32
    assert (dim_x * dim_y == dim_xa * dim_ya == dim_xb * dim_yb)

    m, k = A.shape
    k_b, n = B.shape
    # The kernel receives a single k.  Without this check a mismatched B
    # would silently replace A's k and the kernel would be launched against
    # a buffer of the wrong extent.
    assert k == k_b, 'inner dimensions of A and B must match'

    # Inputs matrices need to be in Fortran order.
    A = cp.asfortranarray(A)
    B = cp.asfortranarray(B)

    C = cp.empty((m, n), dtype=cp.float32, order='F')

    # Values substituted into the kernel source by read_code().
    config = {
        'DIM_X': dim_x, 'DIM_Y': dim_y,
        'BLK_M': blk_m, 'BLK_N': blk_n, 'BLK_K': blk_k,
        'DIM_XA': dim_xa, 'DIM_YA': dim_ya,
        'DIM_XB': dim_xb, 'DIM_YB': dim_yb,
        'THR_M': blk_m // dim_x, 'THR_N': blk_n // dim_y,
    }
    code = read_code(sgemm_file, params=config)
    kern = cp.RawKernel(code, 'sgemm')

    # Enough blocks to cover C, rounding partial tiles up.
    grid = (int(math.ceil(m / blk_m)), int(math.ceil(n / blk_n)), 1)
    block = (dim_x, dim_y, 1)
    args = (m, n, k, A, B, C)
    # Dynamic shared memory in bytes; the factor 4 is presumably
    # sizeof(float32) -- confirm against the kernel source.
    shared_mem = blk_k * (blk_m + 1) * 4 + blk_n * (blk_k + 1) * 4
    kern(grid, block, args=args, shared_mem=shared_mem)
    return C
Example #6
Source File: test_raw.py From cupy with MIT License | 5 votes |
def test_dynamical_parallelism(self):
    """Launch ``test_kernel`` compiled with ``-dc`` and check its output.

    A zero-filled ``float32`` array of 169 elements is passed to the kernel
    together with its length and a chunk size of 13; afterwards every
    element must equal ``1.0``.
    """
    total = 169
    chunk = 13
    kernel = cupy.RawKernel(
        _test_source4,
        'test_kernel',
        options=('-dc',),
        backend=self.backend,
    )
    out = cupy.zeros((total,), dtype=cupy.float32)

    # One block holding total // chunk threads.
    grid = (1,)
    block = (total // chunk,)
    kernel(grid, block, (out, total, chunk))

    assert (out == 1.0).all()
Example #7
Source File: test_raw.py From cupy with MIT License | 5 votes |
def test_dynamical_parallelism_compile_failure(self):
    """Check that launching ``test_kernel`` built without options fails.

    The expected exception depends on the backend: ``CUDADriverError`` for
    ``'nvrtc'`` and ``CompileException`` otherwise.
    """
    # no option for separate compilation is given should cause an error
    kernel = cupy.RawKernel(
        _test_source4, 'test_kernel', backend=self.backend)
    total = 10
    chunk = 2
    out = cupy.zeros((total,), dtype=cupy.float32)

    if self.backend == 'nvrtc':
        # raised when calling ls.complete()
        expected_error = cupy.cuda.driver.CUDADriverError
    else:
        # nvcc
        expected_error = cupy.cuda.compiler.CompileException

    with pytest.raises(expected_error):
        kernel((1,), (total // chunk,), (out, total, chunk))
Example #8
Source File: test_raw.py From cupy with MIT License | 5 votes |
def test_context_switch_RawKernel(self):
    """Reuse ``self.kern`` on device 1 after a first launch.

    The helper is run once before switching devices, then again with
    device 1 current, where ``y`` is checked against ``x1 + x2``.
    """
    # run test_basic() on another device
    kernel = self.kern
    dtype = cupy.float32

    # For RawKernel, we need to launch it once to force compiling
    _x1, _x2, _y = self._helper(kernel, dtype)

    with cupy.cuda.Device(1):
        lhs, rhs, result = self._helper(kernel, dtype)
        # Compared while device 1 is still the current device.
        assert cupy.allclose(result, lhs + rhs)