Python pycuda.compiler.SourceModule() Examples
The following are 30 code examples of pycuda.compiler.SourceModule(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module pycuda.compiler, or try the search function.
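Before the project examples, here is a minimal, self-contained sketch of the usual SourceModule workflow, assuming a working CUDA toolchain and PyCUDA (this snippet is for orientation only and is not taken from any of the projects below): compile a CUDA C kernel from a string, fetch the compiled function by name, and launch it on a NumPy array.

import numpy as np
import pycuda.autoinit          # creates a context on the first available GPU
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# Compile a trivial kernel that doubles every element in place.
mod = SourceModule("""
__global__ void doublify(float *a)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] *= 2.0f;
}
""")
doublify = mod.get_function("doublify")

a = np.random.randn(256).astype(np.float32)
expected = 2 * a
doublify(drv.InOut(a), block=(256, 1, 1), grid=(1, 1))   # copy in, run, copy back
assert np.allclose(a, expected)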
Example #1
Source File: eclat.py From Apriori-and-Eclat-Frequent-Itemset-Mining with MIT License | 6 votes |
def __init__(self, num_trans, min_support, use_CUDA, block, thread, use_optimal=True):
    self.num_trans = num_trans
    self.min_support = min_support * num_trans
    self.support_list = {}
    self.use_CUDA = use_CUDA
    self.use_optimal = use_optimal

    if self.use_CUDA and not self.use_optimal:
        assert block != None and thread != None
        mod = SourceModule("""
            __global__ void multiply_element(int *dest, int *a, int *b)
            {
                const int idx = threadIdx.x + blockDim.x * blockIdx.x;
                dest[idx] = a[idx] * b[idx];
            }""")
        self.multiply = mod.get_function("multiply_element")
        self.block = (block, thread, 1)
        dx, mx = divmod(self.num_trans, self.block[0])
        dy, my = divmod(1, self.block[1])
        self.grid = (int(dx + (mx > 0)), int(dy + (my > 0)))
        print("Using Block =", self.block)
        print("Using Grid =", self.grid)
    elif self.use_CUDA:
        print("Accelerating Eclat computation with GPU!")
    else:
        print("Not using GPU for acceleration.")
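The multiply_element kernel computes an element-wise product of two int32 transaction bitmaps; summing the result gives the support of the combined itemset. A hedged sketch of a helper that could launch the compiled function follows (illustrative only, not part of eclat.py; drv is assumed to be pycuda.driver, and the padding is needed because the kernel has no bounds check):

import numpy
import pycuda.driver as drv

def gpu_support(self, a, b):
    """Hypothetical helper: element-wise product of two bitmaps, summed to get support."""
    # The kernel has no bounds check, so pad both inputs to fill the whole grid.
    n = self.block[0] * self.grid[0]
    pad = numpy.zeros(n - self.num_trans, dtype=numpy.int32)
    a = numpy.concatenate([a.astype(numpy.int32), pad])
    b = numpy.concatenate([b.astype(numpy.int32), pad])
    dest = numpy.empty_like(a)
    self.multiply(drv.Out(dest), drv.In(a), drv.In(b),
                  block=self.block, grid=self.grid)
    return int(dest[:self.num_trans].sum())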
Example #2
Source File: gpu.py From Jamais-Vu with MIT License | 6 votes |
def maximum_filter_2d(arr2D, footprint):
    ## Make sure arr2D is our datatype float32 and footprint of int32
    arr2DMaxed = numpy.empty_like(arr2D)
    head, tail = os.path.split(os.path.abspath(__file__))  # Used so that we can always get the kernel which should be in the same directory as this file
    maxFunction = open(head + "/2DSlidingMaxFootprintKernel.c", "rt")
    maxFunction = SourceModule(maxFunction.read())
    slidingMaxKernel = maxFunction.get_function("slidingMaxiumum2D")

    blockSize = [16, 16]  # To-do: Add a variable to this, can affect performance based on GPU
    gridSize = getGridSize(blockSize, arr2D.shape)  # Get the size of our grid based on the size of a grid (blocksize)

    slidingMaxKernel(cuda.In(arr2D),                        # Input
                     cuda.Out(arr2DMaxed),                  # Output
                     numpy.int32(footprint.shape[1]),       # Kernel Size
                     numpy.int32(arr2D.shape[1]),           # Row Stride
                     numpy.int32(1),                        # Column Stride
                     numpy.int32(int(arr2D.shape[1])),      # Array Column Count
                     numpy.int32(int(arr2D.shape[0])),      # Array Row Count
                     cuda.In(footprint),
                     block=(blockSize[0], blockSize[1], 1),
                     grid=(gridSize[0], gridSize[1], 1)
                     )

    return arr2DMaxed
Example #3
Source File: pycuda_double_op.py From attention-lvcsr with MIT License | 6 votes |
def make_thunk(self, node, storage_map, _, _2):
    mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if(i<size){
            o0[i] = i0[i]*2;
        }
    }""")
    pycuda_fct = mod.get_function("my_fct")
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if z[0] is None or z[0].shape != inputs[0][0].shape:
            z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
        grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
        pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                   block=(512, 1, 1), grid=grid)

    return thunk
Example #4
Source File: test_pycuda_theano_simple.py From attention-lvcsr with MIT License | 6 votes |
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb, block=(400, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
Example #5
Source File: test_pycuda_theano_simple.py From D-VAE with MIT License | 6 votes |
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(400, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
Example #6
Source File: test_pycuda_theano_simple.py From D-VAE with MIT License | 6 votes |
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb, block=(400, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all()
Example #7
Source File: pycuda_double_op.py From D-VAE with MIT License | 6 votes |
def make_thunk(self, node, storage_map, _, _2):
    mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if(i<size){
            o0[i] = i0[i]*2;
        }
    }""")
    pycuda_fct = mod.get_function("my_fct")
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if z[0] is None or z[0].shape != inputs[0][0].shape:
            z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
        grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
        pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                   block=(512, 1, 1), grid=grid)

    return thunk
Example #8
Source File: decompress_cuda.py From pycbc with GNU General Public License v3.0 | 6 votes |
def get_dckernel(slen):
    # Right now, hardcoding the number of threads per block
    nt = 1024
    nb = int(numpy.ceil(slen / 1024.0))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return dckernel_cache[nb]
    except KeyError:
        mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb))
        freq_tex = mod.get_texref("freq_tex")
        amp_tex = mod.get_texref("amp_tex")
        phase_tex = mod.get_texref("phase_tex")
        fn1 = mod.get_function("find_block_indices")
        fn1.prepare("PPifff", texrefs=[freq_tex])
        fn2 = mod.get_function("linear_interp")
        fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])
        dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)
        return dckernel_cache[nb]
Example #9
Source File: test_pycuda_theano_simple.py From attention-lvcsr with MIT License | 6 votes |
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
        const int i = threadIdx.x;
        dest[i] = a[i] * b[i];
    }
    """)
    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
                  block=(400, 1, 1), grid=(1, 1))
    assert (dest == a * b).all()
Example #10
Source File: binary.py From neon with Apache License 2.0 | 6 votes |
def pack_rows():
    code = pack() + r"""
__global__ void pack_rows(float *a, unsigned int *b, int size)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size) {
        b[i] = pack(&a[i * 32]);
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("pack_rows")
    sig = "2P I"
    kernel.prepare(sig)
    return kernel
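pack_rows() returns a prepared kernel: prepare("2P I") declares an argument layout of two pointers and one unsigned int, and the kernel is then launched with prepared_call (recent PyCUDA versions take the grid and block as the first two arguments). A hedged usage sketch, with illustrative array sizes that are not part of the neon source:

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

kernel = pack_rows()

# Each 32-float row of `a` is packed into one unsigned int of sign bits.
a_gpu = gpuarray.to_gpu(np.random.randn(1024, 32).astype(np.float32))
b_gpu = gpuarray.empty(1024, dtype=np.uint32)
size = a_gpu.shape[0]

block = (128, 1, 1)
grid = ((size + 127) // 128, 1)
kernel.prepared_call(grid, block, a_gpu.gpudata, b_gpu.gpudata, size)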
Example #11
Source File: binary.py From neon with Apache License 2.0 | 5 votes |
def shift():
    code = shift_element() + r"""
__global__ void shift(
    float *a, float *b, float *c, bool value, int sizea, int b_rows, int b_cols)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < sizea) {
        float bi;
        if (b_rows > 1 && b_cols > 1) {
            bi = b[i];
        } else if (b_rows > 1 && b_cols == 1) {
            int step = sizea/b_rows;
            bi = b[i/step];
        } else if (b_rows == 1 && b_cols > 1) {
            bi = b[i % b_cols];
        } else if (b_rows == 1 && b_cols == 1) {
            bi = b[0];
        }
        c[i] = shift_element(a[i], bi, value);
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("shift")
    sig = "3P 4I"
    kernel.prepare(sig)
    return kernel
Example #12
Source File: monte_carlo_integrator.py From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License | 5 votes |
def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi,
             samples_per_thread=10**5, num_blocks=100):

    self.math_function = math_function

    if precision in [None, 's', 'S', 'single', np.float32]:
        self.precision = 'float'
        self.numpy_precision = np.float32
        self.p_curand = ''
    elif precision in ['d', 'D', 'double', np.float64]:
        self.precision = 'double'
        self.numpy_precision = np.float64
        self.p_curand = '_double'
    else:
        raise Exception('precision is invalid datatype!')

    if (hi - lo <= 0):
        raise Exception('hi - lo <= 0!')
    else:
        self.hi = hi
        self.lo = lo

    MonteCarloDict = {'p': self.precision,
                      'p_curand': self.p_curand,
                      'math_function': self.math_function}

    self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict

    self.ker = SourceModule(no_extern_c=True, options=['-w'],
                            source=self.MonteCarloCode)

    self.f = self.ker.get_function('monte_carlo')

    self.num_blocks = num_blocks
    self.samples_per_thread = samples_per_thread
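The no_extern_c=True flag is worth noting: by default SourceModule wraps the whole source in extern "C" so that get_function can find unmangled names, which conflicts with C++ headers such as curand_kernel.h. With no_extern_c=True the kernel template itself must wrap its kernels in extern "C". Below is a minimal sketch of that pattern, assuming PyCUDA and the CUDA cuRAND device headers; it is not the MonteCarloKernelTemplate from the book, only an illustration:

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

code = r"""
#include <curand_kernel.h>

extern "C" {
__global__ void fill_uniform(float *out, int n, unsigned long long seed)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        curandState s;
        curand_init(seed, i, 0, &s);        // one generator state per thread
        out[i] = curand_uniform(&s);
    }
}
}
"""
mod = SourceModule(code, no_extern_c=True, options=['-w'])
fill_uniform = mod.get_function('fill_uniform')

n = 1 << 16
out = np.empty(n, dtype=np.float32)
fill_uniform(drv.Out(out), np.int32(n), np.uint64(12345),
             block=(256, 1, 1), grid=((n + 255) // 256, 1))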
Example #13
Source File: multi-kernel_multi-thread.py From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License | 5 votes |
def run(self):
    self.dev = drv.Device(0)
    self.context = self.dev.make_context()

    self.ker = SourceModule(kernel_code)
    self.mult_ker = self.ker.get_function('mult_ker')

    self.array_gpu = gpuarray.to_gpu(self.input_array)
    self.mult_ker(self.array_gpu, np.int32(array_len),
                  block=(64, 1, 1), grid=(1, 1, 1))
    self.output_array = self.array_gpu.get()

    self.context.pop()
Example #14
Source File: binary.py From neon with Apache License 2.0 | 5 votes |
def pack_cols():
    code = r"""
__global__ void pack_cols(float *a, unsigned int *b, int m, int n)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;
    if (j < n && 32 * i < m) {
        float num;
        unsigned int rvalue = 0;
        unsigned int sign;

        for(int k = 0; k < 32; k++) {
            num = a[j + n * (32 * i + k)];
            sign = (num >= 0);
            rvalue = rvalue | (sign << k);
        }
        b[j + n * i] = rvalue;
    }
}
"""
    module = SourceModule(code)
    kernel = module.get_function("pack_cols")
    sig = "2P 2I"
    kernel.prepare(sig)
    return kernel
Example #15
Source File: PatchMatchCuda.py From pyLucid with MIT License | 5 votes |
def propagate(self, iters=2, rand_search_radius=500):
    """
    Optimize the NNF using PatchMatch Algorithm
    :param iters: number of iterations
    :param rand_search_radius: max radius to use in random search
    :return:
    """
    mod = SourceModule(open(os.path.join(package_directory, "patchmatch.cu")).read(),
                       no_extern_c=True)
    patchmatch = mod.get_function("patch_match")

    rows = self.A.shape[0]
    cols = self.A.shape[1]
    channels = np.int32(self.A.shape[2])
    nnf_t = np.zeros(shape=(rows, cols), dtype=np.uint32)
    threads = 20

    def get_blocks_for_dim(dim, blocks):
        # if dim % blocks == 0:
        #     return dim // blocks
        return dim // blocks + 1

    patchmatch(
        drv.In(self.A),
        drv.In(self.AA),
        drv.In(self.B),
        drv.In(self.BB),
        drv.InOut(self.nnf),
        drv.InOut(nnf_t),
        drv.InOut(self.nnd),
        np.int32(rows),
        np.int32(cols),
        channels,
        np.int32(self.patch_size),
        np.int32(iters),
        np.int32(8),
        np.int32(rand_search_radius),
        block=(threads, threads, 1),
        grid=(get_blocks_for_dim(rows, threads),
              get_blocks_for_dim(cols, threads)))
Example #16
Source File: threshold_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_tkernel(slen, window):
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")
    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    nb = int(numpy.ceil(slen / float(window)))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return tfn_cache[(nt, nb)], nt, nb
    except KeyError:
        mod = SourceModule(tkernel1.render(chunk=nt))
        mod2 = SourceModule(tkernel2.render(blocks=nb))
        fn = mod.get_function("threshold_and_cluster")
        fn.prepare("PPPif")
        fn2 = mod2.get_function("threshold_and_cluster2")
        fn2.prepare("PPfi")
        tfn_cache[(nt, nb)] = (fn, fn2)
        return tfn_cache[(nt, nb)], nt, nb
Example #17
Source File: chisq_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_pchisq_fn(np, fuse_correlate=False):
    if np not in _pchisq_cache:
        nt = 256
        mod = SourceModule(chisqkernel.render(NT=nt, NP=np, fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "f" * np + "PPPI")
        else:
            fn.prepare("PPI" + "f" * np + "PPPI")
        _pchisq_cache[np] = (fn, nt)
    return _pchisq_cache[np]
Example #18
Source File: chisq_cuda.py From pycbc with GNU General Public License v3.0 | 5 votes |
def get_pchisq_fn_pow2(np, fuse_correlate=False):
    if np not in _pchisq_cache_pow2:
        nt = 256
        mod = SourceModule(chisqkernel_pow2.render(NT=nt, NP=np, fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s_pow2" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "I" * np + "PPPI")
        else:
            fn.prepare("PPI" + "I" * np + "PPPI")
        _pchisq_cache_pow2[np] = (fn, nt)
    return _pchisq_cache_pow2[np]
Example #19
Source File: cuda_codegen.py From ProxImaL with MIT License | 5 votes |
def compile_cuda_kernel(cuda_kernel_code):
    """compiles a cuda kernel and return compiled module"""
    try:
        cuda_code = cuda_kernel_code if 1 else replace_local_floats_with_double(cuda_kernel_code)
        logging.debug("Compiling cuda code:\n" + cuda_code)
        mod = SourceModule(cuda_code, options=DEFAULT_NVCC_FLAGS + ['--use_fast_math'])
    except cuda.CompileError as e:
        logging.error(cuda_code)
        logging.error("CUDA compilation error:")
        logging.error(e.stderr)
        raise e
    return mod
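For context, a hedged sketch of how this helper might be called from elsewhere in cuda_codegen.py; the scale kernel and array below are illustrative and not part of ProxImaL, and a CUDA context is assumed to be active (pycuda.autoinit is used here purely for the sketch):

import numpy
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

scale_code = """
__global__ void scale(float *x, float s, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= s;
}
"""

mod = compile_cuda_kernel(scale_code)       # returns the compiled SourceModule
scale = mod.get_function("scale")

n = 4096
x_gpu = gpuarray.to_gpu(numpy.ones(n, dtype=numpy.float32))
scale(x_gpu, numpy.float32(2.0), numpy.int32(n),
      block=(256, 1, 1), grid=((n + 255) // 256, 1))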
Example #20
Source File: pycuda_example.py From attention-lvcsr with MIT License | 5 votes |
def make_node(self, *inputs):
    _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        if i.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')

    if any([any(i.type.broadcastable) for i in inputs]):
        raise Exception("pycuda don't support broadcasted dimensions")
    assert len(inputs) == 2  # TODO remove

    otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
    assert self.nout == 1

    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
    in_name = ["i" + str(id) for id in range(len(inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(out_node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(inputs, in_name),
                                izip(out_node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    self.pycuda_fct = mod.get_function(fct_name)
    return out_node
Example #21
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 5 votes |
def _get_shuffle_kernel(dtype):
    code = _shuffle_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("dimShuffle")
    kernel.prepare("PPIIIIIIIIIIIIII")
    return kernel
Example #22
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 5 votes |
def _get_transpose_kernel(dtype):
    code = _transpose_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("transpose")
    kernel.prepare("PPII")
    return kernel
Example #23
Source File: float_ew2.py From ngraph-python with Apache License 2.0 | 5 votes |
def _prepare_compound_kernel(transformer, ops):
    """
    Generate and return a kernel given a set of ops.

    ops (list): List of tuples describing ops to execute in kernel.
                Each tuple should be of the format
                (op_name, input0, input1, output, axis)
    """
    # Take care tensor dimensionality
    ops = _wrap_tensor_descriptions(transformer, ops)

    # Generate kernel source code and block/grid mapping
    (axes_mapping, dims) = _get_axes_mapping(ops)
    code, kernel_name, arg_desc, params = _get_compound_kernel(ops, axes_mapping, dims)

    # Compile kernel
    if _are_flex_params(params):
        code = _includes_template + _flex_includes_template + code
    else:
        code = _includes_template + code

    module = SourceModule(code, options=[])
    kernel = module.get_function(kernel_name)
    kernel.name = kernel_name
    kernel.prepare(arg_desc)

    # Calculate block and grid dims
    blockdim = [1, 1, 1]
    griddim = [1, 1, 1]
    for axis in axes_mapping:
        if axis[0] == 'x':
            blockdim[0] = axis[1]
            griddim[0] = axis[2]
        elif axis[0] == 'y':
            blockdim[1] = axis[1]
            griddim[1] = axis[2]
        elif axis[0] == 'z':
            blockdim[2] = axis[1]
            griddim[2] = axis[2]

    params = [tuple(griddim), tuple(blockdim), None] + params

    return (kernel, params, 128)
Example #24
Source File: pycuda_example.py From D-VAE with MIT License | 5 votes |
def make_node(self, *inputs):
    _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
    if self.nin > 0 and len(_inputs) != self.nin:
        raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
    for i in _inputs[1:]:
        if i.type.ndim != inputs[0].type.ndim:
            raise TypeError('different ranks among inputs')

    if any([any(i.type.broadcastable) for i in inputs]):
        raise Exception("pycuda don't support broadcasted dimensions")
    assert len(inputs) == 2  # TODO remove

    otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
    assert self.nout == 1

    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
    in_name = ["i" + str(id) for id in range(len(inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(out_node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(inputs, in_name),
                                izip(out_node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    self.pycuda_fct = mod.get_function(fct_name)
    return out_node
Example #25
Source File: m_div_eigenenergy_cuda.py From pyscf with Apache License 2.0 | 5 votes |
def div_eigenenergy_cuda(ksn2e, ksn2f, nfermi, vstart, comega, nm2v_re,
                         nm2v_im, block_size, grid_size):
    block = (int(block_size[0]), int(block_size[1]), int(1))
    grid = (int(grid_size[0]), int(grid_size[1]))

    mod = SourceModule(kernel_code_div_eigenenergy_cuda)
    calc_XXVV = mod.get_function("calc_XXVV_gpu")

    calc_XXVV(nm2v_re, nm2v_im, np.int32(nm2v_re.shape[0]),
              np.int32(nm2v_re.shape[1]), ksn2e, ksn2f,
              np.int32(nfermi), np.int32(vstart),
              np.int32(ksn2e.shape[0]),
              np.float64(comega.real), np.float64(comega.imag),
              block=block, grid=grid)
Example #26
Source File: dihedral_ops.py From kaggle-ndsb with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    mod = SourceModule("""
    __global__ void cyclic_roll(float * input, float * output, int batch_size, int num_features) {
        int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
        int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

        int height = 4 * batch_size;
        int width = 4 * num_features;

        if (x < num_features && y < height) {
            for (int i = 0; i < 4; i++) {
                int y_out = (y + batch_size * (4 - i)) % height;
                int x_out = x + num_features * i;
                output[y_out * width + x_out] = input[y * num_features + x];
            }
        }
    }""")
    kernel = mod.get_function("cyclic_roll")

    def thunk():
        in_shape = inputs[0][0].shape
        rows, cols = in_shape

        assert rows % 4 == 0

        out_shape = (rows, 4 * cols)
        batch_size = rows // 4
        num_features = cols

        out = outputs[0]
        # only allocate if there is no previous allocation of the right size.
        if out[0] is None or out[0].shape != out_shape:
            out[0] = cuda.CudaNdarray.zeros(out_shape)

        x_block = 16
        y_block = 16
        block = (x_block, y_block, 1)

        x_grid = int(np.ceil(float(in_shape[1]) / x_block))
        y_grid = int(np.ceil(float(in_shape[0]) / y_block))
        grid = (x_grid, y_grid, 1)

        kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features),
               block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #27
Source File: dihedral_ops.py From kaggle-ndsb with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    mod = SourceModule("""
    __global__ void cyclic_roll_grad(float * input, float * output, int batch_size, int num_features) {
        int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
        int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

        int height = 4 * batch_size;
        int width = 4 * num_features;

        float val = 0;

        if (x < num_features && y < height) {
            for (int i = 0; i < 4; i++) {
                int y_in = (y + batch_size * (4 - i)) % height;
                int x_in = x + num_features * i;
                val += input[y_in * width + x_in];
            }
            output[y * num_features + x] = val;
        }
    }""")
    kernel = mod.get_function("cyclic_roll_grad")

    def thunk():
        in_shape = inputs[0][0].shape
        rows, cols = in_shape

        assert rows % 4 == 0
        assert cols % 4 == 0

        out_shape = (rows, cols // 4)
        batch_size = rows // 4
        num_features = cols // 4

        out = outputs[0]
        # only allocate if there is no previous allocation of the right size.
        if out[0] is None or out[0].shape != out_shape:
            out[0] = cuda.CudaNdarray.zeros(out_shape)

        x_block = 16
        y_block = 16
        block = (x_block, y_block, 1)

        x_grid = int(np.ceil(float(out_shape[1]) / x_block))
        y_grid = int(np.ceil(float(out_shape[0]) / y_block))
        grid = (x_grid, y_grid, 1)

        kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features),
               block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #28
Source File: pycuda_example.py From attention-lvcsr with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    # TODO support broadcast!
    # TODO assert all input have the same shape
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    in_name = ["i" + str(id) for id in range(len(node.inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(node.inputs, in_name),
                                izip(node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    pycuda_fct = mod.get_function(fct_name)
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if (z[0] is None or
                z[0].shape != inputs[0][0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0][0].shape)
        if inputs[0][0].shape != inputs[1][0].shape:
            raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                            " inputs don't have the same shape!")

        if inputs[0][0].size > 512:
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
        pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                   numpy.intc(inputs[1][0].size), block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk
Example #29
Source File: flex_conv.py From ngraph-python with Apache License 2.0 | 4 votes |
def _get_convert_kernel(dtype):
    _convert_kernel = r"""
#include <cuda_fp16.h>

__device__ short iabs(short a)
{
    return (a < 0) ? (-a) : a;
}

__global__ void convert(short* out, const %(type)s* in, int dim,
                        float scale, int* flex_data)
{
    int offset = blockIdx.x * dim;
    int max_val = 0;

    for(int item = threadIdx.x; item < dim; item += 32)
    {
        %(type)s value = in[offset + item];
        short result = (short)(%(cvt)s(value) * scale);
        max_val = max((int)iabs(result), max_val);
        out[offset + item] = result;
    }

    atomicMax(flex_data, max_val);
}
"""
    if dtype == "f4":
        template_vals = {
            "type": "float",
            "cvt": "",
        }
    elif dtype == "f2":
        template_vals = {
            "type": "unsigned short",
            "cvt": "__half2float"
        }
    else:
        raise ValueError("Invalid conversion type")

    code = _convert_kernel % template_vals
    module = SourceModule(code)
    kernel = module.get_function("convert")
    kernel.prepare("PPIfP")
    return kernel
Example #30
Source File: pycuda_example.py From D-VAE with MIT License | 4 votes |
def make_thunk(self, node, storage_map, _, _2):
    # TODO support broadcast!
    # TODO assert all input have the same shape
    fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
    in_name = ["i" + str(id) for id in range(len(node.inputs))]
    out_name = ["o" + str(id) for id in range(self.nout)]

    c_code = self.scalar_op.c_code(node, "some_name",
                                   tuple([n + "[i]" for n in in_name]),
                                   tuple(n + "[i]" for n in out_name), {})
    c_code_param = ", ".join(
        [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
         for var, name in chain(izip(node.inputs, in_name),
                                izip(node.outputs, out_name))] +
        ["int size"])

    mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
    pycuda_fct = mod.get_function(fct_name)
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if (z[0] is None or
                z[0].shape != inputs[0][0].shape or
                not z[0].is_c_contiguous()):
            z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0][0].shape)
        if inputs[0][0].shape != inputs[1][0].shape:
            raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                            " inputs don't have the same shape!")

        if inputs[0][0].size > 512:
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            block = (512, 1, 1)
        else:
            grid = (1, 1)
            block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
        pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                   numpy.intc(inputs[1][0].size), block=block, grid=grid)

    thunk.inputs = inputs
    thunk.outputs = outputs
    thunk.lazy = False

    return thunk