Python pycuda.compiler.SourceModule() Examples
The following are 30 code examples of pycuda.compiler.SourceModule(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pycuda.compiler, or try the search function.
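Before the project examples, here is a minimal, self-contained sketch of the usual SourceModule workflow, assuming a working CUDA toolchain and PyCUDA (this snippet is for orientation only and is not taken from any of the projects below): compile a CUDA C kernel from a string, fetch the compiled function by name, and launch it on a NumPy array.

import numpy as np
import pycuda.autoinit          # creates a context on the first available GPU
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Compile a trivial kernel that doubles every element in place.
mod = SourceModule("""
__global__ void doublify(float *a)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] *= 2.0f;
}
""")
doublify = mod.get_function("doublify")

a = np.random.randn(256).astype(np.float32)
expected = 2 * a
doublify(drv.InOut(a), block=(256, 1, 1), grid=(1, 1))   # copy in, run, copy back
assert np.allclose(a, expected)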
Example #1
Source File: eclat.py From Apriori-and-Eclat-Frequent-Itemset-Mining with MIT License | 6 votes |
def __init__(self, num_trans, min_support, use_CUDA, block, thread, use_optimal=True):
    self.num_trans = num_trans
    self.min_support = min_support * num_trans
    self.support_list = {}
    self.use_CUDA = use_CUDA
    self.use_optimal = use_optimal

    if self.use_CUDA and not self.use_optimal:
        assert block != None and thread != None
        mod = SourceModule("""
            __global__ void multiply_element(int *dest, int *a, int *b)
            {
                const int idx = threadIdx.x + blockDim.x * blockIdx.x;
                dest[idx] = a[idx] * b[idx];
            }""")
        self.multiply = mod.get_function("multiply_element")
        self.block = (block, thread, 1)
        dx, mx = divmod(self.num_trans, self.block[0])
        dy, my = divmod(1, self.block[1])
        self.grid = (int(dx + (mx > 0)), int(dy + (my > 0)))
        print("Using Block =", self.block)
        print("Using Grid =", self.grid)
    elif self.use_CUDA:
        print("Accelerating Eclat computation with GPU!")
    else:
        print("Not using GPU for acceleration.")
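The multiply_element kernel computes an element-wise product of two int32 transaction bitmaps; summing the result gives the support of the combined itemset. A hedged sketch of a helper that could launch the compiled function follows (illustrative only, not part of eclat.py; drv is assumed to be pycuda.driver, and the padding is needed because the kernel has no bounds check):

import numpy
import pycuda.driver as drv

def gpu_support(self, a, b):
    """Hypothetical helper: element-wise product of two bitmaps, summed to get support."""
    # The kernel has no bounds check, so pad both inputs to fill the whole grid.
    n = self.block[0] * self.grid[0]
    pad = numpy.zeros(n - self.num_trans, dtype=numpy.int32)
    a = numpy.concatenate([a.astype(numpy.int32), pad])
    b = numpy.concatenate([b.astype(numpy.int32), pad])
    dest = numpy.empty_like(a)
    self.multiply(drv.Out(dest), drv.In(a), drv.In(b),
                  block=self.block, grid=self.grid)
    return int(dest[:self.num_trans].sum())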
Example #2
Source File: gpu.py From Jamais-Vu with MIT License | 6 votes |
def maximum_filter_2d(arr2D, footprint):
    ## Make sure arr2D is our datatype float32 and footprint of int32
    arr2DMaxed = numpy.empty_like(arr2D)
    head, tail = os.path.split(os.path.abspath(__file__))  # Used so that we can always get the kernel which should be in the same directory as this file
    maxFunction = open(head + "/2DSlidingMaxFootprintKernel.c", "rt")
    maxFunction = SourceModule(maxFunction.read())
    slidingMaxKernel = maxFunction.get_function("slidingMaxiumum2D")

    blockSize = [16, 16]  # To-do: Add a variable to this, can affect performance based on GPU
    gridSize = getGridSize(blockSize, arr2D.shape)  # Get the size of our grid based on the size of a grid (blocksize)

    slidingMaxKernel(cuda.In(arr2D),                        # Input
                     cuda.Out(arr2DMaxed),                  # Output
                     numpy.int32(footprint.shape[1]),       # Kernel Size
                     numpy.int32(arr2D.shape[1]),           # Row Stride
                     numpy.int32(1),                        # Column Stride
                     numpy.int32(int(arr2D.shape[1])),      # Array Column Count
                     numpy.int32(int(arr2D.shape[0])),      # Array Row Count
                     cuda.In(footprint),
                     block=(blockSize[0], blockSize[1], 1),
                     grid=(gridSize[0], gridSize[1], 1)
                     )

    return arr2DMaxed
Example #3
Source File: pycuda_double_op.py From attention-lvcsr with MIT License | 6 votes |
def make_thunk(self, node, storage_map, _, _2):
    mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if(i<size){
            o0[i] = i0[i]*2;
        }
    }""")
    pycuda_fct = mod.get_function("my_fct")
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if z[0] is None or z[0].shape != inputs[0][0].shape:
            z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
        grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
        pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                   block=(512, 1, 1), grid=grid)

    return thunk
Example #4
Source File: test_pycuda_theano_simple.py From attention-lvcsr with MIT License | 6 votes |
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb, block=(400, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
Example #5
Source File: test_pycuda_theano_simple.py From D-VAE with MIT License | 6 votes |
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(400, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
Example #6
Source File: test_pycuda_theano_simple.py From D-VAE with MIT License | 6 votes |
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb, block=(400, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
Example #7
Source File: pycuda_double_op.py From D-VAE with MIT License | 6 votes |
def make_thunk(self, node, storage_map, _, _2):
    mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if(i<size){
            o0[i] = i0[i]*2;
        }
    }""")
    pycuda_fct = mod.get_function("my_fct")
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if z[0] is None or z[0].shape != inputs[0][0].shape:
            z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
        grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
        pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                   block=(512, 1, 1), grid=grid)

    return thunk
Example #8
Source File: decompress_cuda.py From pycbc with GNU General Public License v3.0 | 6 votes |
def get_dckernel(slen):
    # Right now, hardcoding the number of threads per block
    nt = 1024
    nb = int(numpy.ceil(slen / 1024.0))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return dckernel_cache[nb]
    except KeyError:
        mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb))
        freq_tex = mod.get_texref("freq_tex")
        amp_tex = mod.get_texref("amp_tex")
        phase_tex = mod.get_texref("phase_tex")
        fn1 = mod.get_function("find_block_indices")
        fn1.prepare("PPifff", texrefs=[freq_tex])
        fn2 = mod.get_function("linear_interp")
        fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])
        dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)
        return dckernel_cache[nb]
Example #9
Source File: test_pycuda_theano_simple.py From attention-lvcsr with MIT License | 6 votes |
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(400, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
Example #10
Source File: binary.py From neon with Apache License 2.0 | 6 votes |
def pack_rows():
    code = pack() + r"""
__global__ void pack_rows(float *a, unsigned int *b, int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        b[i] = pack(&a[i * 32]);
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("pack_rows")
    sig = "2P I"
    kernel.prepare(sig)
    return kernel
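pack_rows() returns a prepared kernel: prepare("2P I") declares an argument layout of two pointers and one unsigned int, and the kernel is then launched with prepared_call (recent PyCUDA versions take the grid and block as the first two arguments). A hedged usage sketch, with illustrative array sizes that are not part of the neon source:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

kernel = pack_rows()

# Each 32-float row of `a` is packed into one unsigned int of sign bits.
a_gpu = gpuarray.to_gpu(np.random.randn(1024, 32).astype(np.float32))
b_gpu = gpuarray.empty(1024, dtype=np.uint32)
size = a_gpu.shape[0]

block = (128, 1, 1)
grid = ((size + 127) // 128, 1)
kernel.prepared_call(grid, block, a_gpu.gpudata, b_gpu.gpudata, size)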
Example #11
Source File: binary.py From neon with Apache License 2.0 | 5 votes |
def shift():
    code = shift_element() + r"""
__global__ void shift(
    float *a, float *b, float *c, bool value, int sizea, int b_rows, int b_cols)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < sizea) {
        float bi;
        if (b_rows > 1 && b_cols > 1) {
            bi = b[i];
        } else if (b_rows > 1 && b_cols == 1) {
            int step = sizea/b_rows;
            bi = b[i/step];
        } else if (b_rows == 1 && b_cols > 1) {
            bi = b[i % b_cols];
        } else if (b_rows == 1 && b_cols == 1) {
            bi = b[0];
        }
        c[i] = shift_element(a[i], bi, value);
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("shift")
    sig = "3P 4I"
    kernel.prepare(sig)
    return kernel
Example #12
Source File: monte_carlo_integrator.py From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License | 5 votes |
def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi,
             samples_per_thread=10**5, num_blocks=100):

    self.math_function = math_function

    if precision in [None, 's', 'S', 'single', np.float32]:
        self.precision = 'float'
        self.numpy_precision = np.float32
        self.p_curand = ''
    elif precision in ['d', 'D', 'double', np.float64]:
        self.precision = 'double'
        self.numpy_precision = np.float64
        self.p_curand = '_double'
    else:
        raise Exception('precision is invalid datatype!')

    if (hi - lo <= 0):
        raise Exception('hi - lo <= 0!')
    else:
        self.hi = hi
        self.lo = lo

    MonteCarloDict = {'p': self.precision,
                      'p_curand': self.p_curand,
                      'math_function': self.math_function}

    self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict

    self.ker = SourceModule(no_extern_c=True, options=['-w'],
                            source=self.MonteCarloCode)

    self.f = self.ker.get_function('monte_carlo')

    self.num_blocks = num_blocks
    self.samples_per_thread = samples_per_thread
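The no_extern_c=True flag is worth noting: by default SourceModule wraps the whole source in extern "C" so that get_function can find unmangled names, which conflicts with C++ headers such as curand_kernel.h. With no_extern_c=True the kernel template itself must wrap its kernels in extern "C". Below is a minimal sketch of that pattern, assuming PyCUDA and the CUDA cuRAND device headers; it is not the MonteCarloKernelTemplate from the book, only an illustration:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

code = r"""
#include <curand_kernel.h>

extern "C" {
__global__ void fill_uniform(float *out, int n, unsigned long long seed)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        curandState s;
        curand_init(seed, i, 0, &s);        // one generator state per thread
        out[i] = curand_uniform(&s);
    }
}
}
"""
mod = SourceModule(code, no_extern_c=True, options=['-w'])
fill_uniform = mod.get_function('fill_uniform')

n = 1 << 16
out = np.empty(n, dtype=np.float32)
fill_uniform(drv.Out(out), np.int32(n), np.uint64(12345),
             block=(256, 1, 1), grid=((n + 255) // 256, 1))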
Example #13
Source File: multi-kernel_multi-thread.py From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License | 5 votes |
def run(self):
    self.dev = drv.Device(0)
    self.context = self.dev.make_context()

    self.ker = SourceModule(kernel_code)
    self.mult_ker = self.ker.get_function('mult_ker')

    self.array_gpu = gpuarray.to_gpu(self.input_array)
    self.mult_ker(self.array_gpu, np.int32(array_len),
                  block=(64, 1, 1), grid=(1, 1, 1))
    self.output_array = self.array_gpu.get()

    self.context.pop()
Example #14
Source File: binary.py From neon with Apache License 2.0 | 5 votes |
def pack_cols():
    code = r"""
__global__ void pack_cols(float *a, unsigned int *b, int m, int n)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if (j < n && 32 * i < m) {
        float num;
        unsigned int rvalue = 0;
        unsigned int sign;

        for(int k = 0; k < 32; k++) {
            num = a[j + n * (32 * i + k)];
            sign = (num >= 0);
            rvalue = rvalue | (sign << k);
        }
        b[j + n * i] = rvalue;
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("pack_cols")
    sig = "2P 2I"
    kernel.prepare(sig)
    return kernel
Example #15
Source File: PatchMatchCuda.py From pyLucid with MIT License | 5 votes |
def propagate(self, iters=2, rand_search_radius=500):
    """
    Optimize the NNF using PatchMatch Algorithm
    :param iters: number of iterations
    :param rand_search_radius: max radius to use in random search
    :return:
    """
    mod = SourceModule(open(os.path.join(package_directory, "patchmatch.cu")).read(),
                       no_extern_c=True)
    patchmatch = mod.get_function("patch_match")

    rows = self.A.shape[0]
    cols = self.A.shape[1]
    channels = np.int32(self.A.shape[2])
    nnf_t = np.zeros(shape=(rows, cols), dtype=np.uint32)
    threads = 20

    def get_blocks_for_dim(dim, blocks):
        # if dim % blocks == 0:
        #     return dim // blocks
        return dim // blocks + 1

    patchmatch(
        drv.In(self.A),
        drv.In(self.AA),
        drv.In(self.B),
        drv.In(self.BB),
        drv.InOut(self.nnf),
        drv.InOut(nnf_t),
        drv.InOut(self.nnd),
        np.int32(rows),
        np.int32(cols),
        channels,
        np.int32(self.patch_size),
        np.int32(iters),
        np.int32(8),
        np.int32(rand_search_radius),
        block=(threads, threads, 1),
        grid=(get_blocks_for_dim(rows, threads),
              get_blocks_for_dim(cols, threads)))
Example #16
Source File: threshold_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_tkernel(slen, window):
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    nb = int(numpy.ceil(slen / float(window)))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return tfn_cache[(nt, nb)], nt, nb
    except KeyError:
        mod = SourceModule(tkernel1.render(chunk=nt))
        mod2 = SourceModule(tkernel2.render(blocks=nb))
        fn = mod.get_function("threshold_and_cluster")
        fn.prepare("PPPif")
        fn2 = mod2.get_function("threshold_and_cluster2")
        fn2.prepare("PPfi")
        tfn_cache[(nt, nb)] = (fn, fn2)
        return tfn_cache[(nt, nb)], nt, nb
Example #17
Source File: chisq_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_pchisq_fn(np, fuse_correlate=False):
    if np not in _pchisq_cache:
        nt = 256
        mod = SourceModule(chisqkernel.render(NT=nt, NP=np, fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "f" * np + "PPPI")
        else:
            fn.prepare("PPI" + "f" * np + "PPPI")
        _pchisq_cache[np] = (fn, nt)
    return _pchisq_cache[np]
Example #18
Source File: chisq_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_pchisq_fn_pow2(np, fuse_correlate=False):
    if np not in _pchisq_cache_pow2:
        nt = 256
        mod = SourceModule(chisqkernel_pow2.render(NT=nt, NP=np, fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s_pow2" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "I" * np + "PPPI")
        else:
            fn.prepare("PPI" + "I" * np + "PPPI")
        _pchisq_cache_pow2[np] = (fn, nt)
    return _pchisq_cache_pow2[np]
Example #19
Source File: cuda_codegen.py From ProxImaL with MIT License | 5 votes |
def compile_cuda_kernel(cuda_kernel_code):
    """compiles a cuda kernel and return compiled module"""
    try:
        cuda_code = cuda_kernel_code if 1 else replace_local_floats_with_double(cuda_kernel_code)
        logging.debug("Compiling cuda code:\n" + cuda_code)
        mod = SourceModule(cuda_code, options=DEFAULT_NVCC_FLAGS + ['--use_fast_math'])
    except cuda.CompileError as e:
        logging.error(cuda_code)
        logging.error("CUDA compilation error:")
        logging.error(e.stderr)
        raise e
    return mod
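For context, a hedged sketch of how this helper might be called from elsewhere in cuda_codegen.py; the scale kernel and array below are illustrative and not part of ProxImaL, and a CUDA context is assumed to be active (pycuda.autoinit is used here purely for the sketch):

import numpy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

scale_code = """
__global__ void scale(float *x, float s, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}
"""

mod = compile_cuda_kernel(scale_code)       # returns the compiled SourceModule
scale = mod.get_function("scale")

n = 4096
x_gpu = gpuarray.to_gpu(numpy.ones(n, dtype=numpy.float32))
scale(x_gpu, numpy.float32(2.0), numpy.int32(n),
      block=(256, 1, 1), grid=((n + 255) // 256, 1))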
Example #20
Source File: pycuda_example.py From attention-lvcsr with MIT License | 5 votes |
def make_node(self, *inputs):
    _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        if i.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')

    if any([any(i.type.broadcastable) for i in inputs]):
        raise Exception("pycuda don't support broadcasted dimensions")
    assert len(inputs) == 2  # TODO remove

    otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
    assert self.nout == 1

    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
    in_name = ["i" + str(id) for id in range(len(inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(out_node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(inputs, in_name),
                                izip(out_node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    self.pycuda_fct = mod.get_function(fct_name)
    return out_node
Example #21
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 5 votes |
def _get_shuffle_kernel(dtype):
    code = _shuffle_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("dimShuffle")
    kernel.prepare("PPIIIIIIIIIIIIII")
    return kernel
Example #22
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 5 votes |
def _get_transpose_kernel(dtype):
    code = _transpose_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("transpose")
    kernel.prepare("PPII")
    return kernel
Example #23
Source File: float_ew2.py From ngraph-python with Apache License 2.0 | 5 votes |
def _prepare_compound_kernel(transformer, ops):
    """
    Generate and return a kernel given a set of ops.

    ops (list): List of tuples describing ops to execute in kernel.
                Each tuple should be of the format
                (op_name, input0, input1, output, axis)
    """
    # Take care tensor dimensionality
    ops = _wrap_tensor_descriptions(transformer, ops)

    # Generate kernel source code and block/grid mapping
    (axes_mapping, dims) = _get_axes_mapping(ops)
    code, kernel_name, arg_desc, params = _get_compound_kernel(ops, axes_mapping, dims)

    # Compile kernel
    if _are_flex_params(params):
        code = _includes_template + _flex_includes_template + code
    else:
        code = _includes_template + code

    module = SourceModule(code, options=[])
    kernel = module.get_function(kernel_name)
    kernel.name = kernel_name
    kernel.prepare(arg_desc)

    # Calculate block and grid dims
    blockdim = [1, 1, 1]
    griddim = [1, 1, 1]
    for axis in axes_mapping:
        if axis[0] == 'x':
            blockdim[0] = axis[1]
            griddim[0] = axis[2]
        elif axis[0] == 'y':
            blockdim[1] = axis[1]
            griddim[1] = axis[2]
        elif axis[0] == 'z':
            blockdim[2] = axis[1]
            griddim[2] = axis[2]

    params = [tuple(griddim), tuple(blockdim), None] + params

    return (kernel, params, 128)
Example #24
Source File: pycuda_example.py From D-VAE with MIT License | 5 votes |
def make_node(self, *inputs):
    _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        if i.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')

    if any([any(i.type.broadcastable) for i in inputs]):
        raise Exception("pycuda don't support broadcasted dimensions")
    assert len(inputs) == 2  # TODO remove

    otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
    assert self.nout == 1

    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
    in_name = ["i" + str(id) for id in range(len(inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(out_node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(inputs, in_name),
                                izip(out_node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    self.pycuda_fct = mod.get_function(fct_name)
    return out_node
Example #25
Source File: m_div_eigenenergy_cuda.py From pyscf with Apache License 2.0 | 5 votes |
def div_eigenenergy_cuda(ksn2e, ksn2f, nfermi, vstart, comega, nm2v_re,
                         nm2v_im, block_size, grid_size):
    block = (int(block_size[0]), int(block_size[1]), int(1))
    grid = (int(grid_size[0]), int(grid_size[1]))

    mod = SourceModule(kernel_code_div_eigenenergy_cuda)
    calc_XXVV = mod.get_function("calc_XXVV_gpu")

    calc_XXVV(nm2v_re, nm2v_im, np.int32(nm2v_re.shape[0]),
              np.int32(nm2v_re.shape[1]), ksn2e, ksn2f,
              np.int32(nfermi), np.int32(vstart),
              np.int32(ksn2e.shape[0]),
              np.float64(comega.real), np.float64(comega.imag),
              block=block, grid=grid)
Example #26
Source File: dihedral_ops.py From kaggle-ndsb with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    mod = SourceModule("""
    __global__ void cyclic_roll(float * input, float * output, int batch_size, int num_features) {
        int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
        int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

        int height = 4 * batch_size;
        int width = 4 * num_features;

        if (x < num_features && y < height) {
            for (int i = 0; i < 4; i++) {
                int y_out = (y + batch_size * (4 - i)) % height;
                int x_out = x + num_features * i;
                output[y_out * width + x_out] = input[y * num_features + x];
            }
        }
    }""")
    kernel = mod.get_function("cyclic_roll")

    def thunk():
        in_shape = inputs[0][0].shape
        rows, cols = in_shape

        assert rows % 4 == 0

        out_shape = (rows, 4 * cols)
        batch_size = rows // 4
        num_features = cols

        out = outputs[0]
        # only allocate if there is no previous allocation of the right size.
        if out[0] is None or out[0].shape != out_shape:
            out[0] = cuda.CudaNdarray.zeros(out_shape)

        x_block = 16
        y_block = 16
        block = (x_block, y_block, 1)

        x_grid = int(np.ceil(float(in_shape[1]) / x_block))
        y_grid = int(np.ceil(float(in_shape[0]) / y_block))
        grid = (x_grid, y_grid, 1)

        kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features),
               block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #27
Source File: dihedral_ops.py From kaggle-ndsb with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    mod = SourceModule("""
    __global__ void cyclic_roll_grad(float * input, float * output, int batch_size, int num_features) {
        int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
        int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

        int height = 4 * batch_size;
        int width = 4 * num_features;

        float val = 0;

        if (x < num_features && y < height) {
            for (int i = 0; i < 4; i++) {
                int y_in = (y + batch_size * (4 - i)) % height;
                int x_in = x + num_features * i;
                val += input[y_in * width + x_in];
            }
            output[y * num_features + x] = val;
        }
    }""")
    kernel = mod.get_function("cyclic_roll_grad")

    def thunk():
        in_shape = inputs[0][0].shape
        rows, cols = in_shape

        assert rows % 4 == 0
        assert cols % 4 == 0

        out_shape = (rows, cols // 4)
        batch_size = rows // 4
        num_features = cols // 4

        out = outputs[0]
        # only allocate if there is no previous allocation of the right size.
        if out[0] is None or out[0].shape != out_shape:
            out[0] = cuda.CudaNdarray.zeros(out_shape)

        x_block = 16
        y_block = 16
        block = (x_block, y_block, 1)

        x_grid = int(np.ceil(float(out_shape[1]) / x_block))
        y_grid = int(np.ceil(float(out_shape[0]) / y_block))
        grid = (x_grid, y_grid, 1)

        kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features),
               block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #28
Source File: pycuda_example.py From attention-lvcsr with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    # TODO support broadcast!
    # TODO assert all input have the same shape
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    in_name = ["i" + str(id) for id in range(len(node.inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(node.inputs, in_name),
                                izip(node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    pycuda_fct = mod.get_function(fct_name)
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if (z[0] is None or
                z[0].shape != inputs[0][0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0][0].shape)
        if inputs[0][0].shape != inputs[1][0].shape:
            raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                            " inputs don't have the same shape!")

        if inputs[0][0].size > 512:
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
        pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                   numpy.intc(inputs[1][0].size), block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #29
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 4 votes |
def _get_convert_kernel(dtype):
    _convert_kernel = r"""
#include <cuda_fp16.h>

__device__ short iabs(short a)
{
    return (a < 0) ? (-a) : a;
}

__global__ void convert(short* out, const %(type)s* in, int dim,
                        float scale, int* flex_data)
{
    int offset = blockIdx.x * dim;
    int max_val = 0;

    for(int item = threadIdx.x; item < dim; item += 32)
    {
        %(type)s value = in[offset + item];
        short result = (short)(%(cvt)s(value) * scale);
        max_val = max((int)iabs(result), max_val);
        out[offset + item] = result;
    }

    atomicMax(flex_data, max_val);
}
"""
    if dtype == "f4":
        template_vals = {
            "type": "float",
            "cvt": "",
        }
    elif dtype == "f2":
        template_vals = {
            "type": "unsigned short",
            "cvt": "__half2float"
        }
    else:
        raise ValueError("Invalid conversion type")

    code = _convert_kernel % template_vals
    module = SourceModule(code)
    kernel = module.get_function("convert")
    kernel.prepare("PPIfP")
    return kernel
Example #30
Source File: pycuda_example.py From D-VAE with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    # TODO support broadcast!
    # TODO assert all input have the same shape
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    in_name = ["i" + str(id) for id in range(len(node.inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(node.inputs, in_name),
                                izip(node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    pycuda_fct = mod.get_function(fct_name)
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if (z[0] is None or
                z[0].shape != inputs[0][0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0][0].shape)
        if inputs[0][0].shape != inputs[1][0].shape:
            raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                            " inputs don't have the same shape!")

        if inputs[0][0].size > 512:
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
        pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                   numpy.intc(inputs[1][0].size), block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk