Python pycuda.compiler.SourceModule() Examples

The following are 30 code examples of pycuda.compiler.SourceModule(). You can go to the original project or source file via the reference above each example. You may also want to check out all available functions/classes of the module pycuda.compiler, or try the search function.
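Before the project-specific examples, here is a minimal, self-contained sketch of the typical SourceModule workflow: compile a snippet of CUDA C, fetch the kernel by name with get_function(), and launch it on numpy arrays wrapped with drv.In/drv.Out. It mirrors the multiply_them test that several examples below reuse; only the 400-element array size is an arbitrary choice.

import numpy
import pycuda.autoinit          # creates a CUDA context on the default device
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

# one thread per element; drv.In/drv.Out copy the numpy arrays to and from the device
multiply_them(drv.Out(dest), drv.In(a), drv.In(b),
              block=(400, 1, 1), grid=(1, 1))
assert (dest == a * b).all()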
Example #1
Source File: eclat.py    From Apriori-and-Eclat-Frequent-Itemset-Mining with MIT License
def __init__(self, num_trans, min_support, use_CUDA, block, thread, use_optimal=True):
		self.num_trans = num_trans
		self.min_support = min_support * num_trans
		self.support_list = {}
		self.use_CUDA = use_CUDA
		self.use_optimal = use_optimal
		if self.use_CUDA and not self.use_optimal:
			assert block != None and thread != None
			mod = SourceModule("""__global__ void multiply_element(int *dest, int *a, int *b) {
								const int idx = threadIdx.x + blockDim.x * blockIdx.x;
								dest[idx] = a[idx] * b[idx];
							   }""")
			self.multiply = mod.get_function("multiply_element")
			self.block = (block, thread, 1)
			dx, mx = divmod(self.num_trans, self.block[0])
			dy, my = divmod(1, self.block[1])
			self.grid = (int(dx + (mx>0)), int(dy + (my>0)))
			print("Using Block =", self.block)
			print("Using Grid =", self.grid)
		elif self.use_CUDA:
			print("Accelerating Eclat computation with GPU!")
		else:
			print("Not using GPU for acceleration.") 
Example #2
Source File: gpu.py    From Jamais-Vu with MIT License
def maximum_filter_2d(arr2D, footprint): ## arr2D must be float32 and footprint int32
    arr2DMaxed = numpy.empty_like(arr2D)
    head, tail = os.path.split(os.path.abspath(__file__)) # the kernel file is expected to live in the same directory as this module

    maxFunction = open(head + "/2DSlidingMaxFootprintKernel.c", "rt")
    maxFunction = SourceModule(maxFunction.read())
    slidingMaxKernel = maxFunction.get_function("slidingMaxiumum2D")

    blockSize = [16, 16] # To-do: make this configurable; the choice can affect performance depending on the GPU
    gridSize = getGridSize(blockSize, arr2D.shape) # compute the grid size from the array shape and the block size


    slidingMaxKernel(cuda.In(arr2D),                   # Input
                    cuda.Out(arr2DMaxed),              # Output
                    numpy.int32(footprint.shape[1]),   # Kernel Size
                    numpy.int32(arr2D.shape[1]),       # Row Stride
                    numpy.int32(1),                    # Column Stride
                    numpy.int32(int(arr2D.shape[1])),  # Array Column Count
                    numpy.int32(int(arr2D.shape[0])),  # Array Row Count
                    cuda.In(footprint),
                    block=(blockSize[0],blockSize[1],1),
                    grid=(gridSize[0],gridSize[1],1)
    )

    return arr2DMaxed 
Example #3
Source File: pycuda_double_op.py    From attention-lvcsr with MIT License
def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i<size){
        o0[i] = i0[i]*2;
    }
  }""")
        pycuda_fct = mod.get_function("my_fct")
        inputs = [ storage_map[v] for v in node.inputs]
        outputs = [ storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            if z[0] is None or z[0].shape!=inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512,1,1), grid=grid)

        return thunk 
Example #4
Source File: test_pycuda_theano_simple.py    From attention-lvcsr with MIT License
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb,
                  block=(100, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all() 
Example #5
Source File: test_pycuda_theano_simple.py    From D-VAE with MIT License
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(100, 1, 1), grid=(1, 1))
    assert (dest == a * b).all() 
Example #6
Source File: test_pycuda_theano_simple.py    From D-VAE with MIT License
def test_pycuda_theano():
    """Simple example with pycuda function and Theano CudaNdarray object."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)

    # Test with Theano object
    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)
    dest = cuda_ndarray.CudaNdarray.zeros(a.shape)
    multiply_them(dest, ga, gb,
                  block=(100, 1, 1), grid=(1, 1))
    assert (numpy.asarray(dest) == a * b).all() 
Example #7
Source File: pycuda_double_op.py    From D-VAE with MIT License
def make_thunk(self, node, storage_map, _, _2):
        mod = SourceModule("""
    __global__ void my_fct(float * i0, float * o0, int size) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i<size){
        o0[i] = i0[i]*2;
    }
  }""")
        pycuda_fct = mod.get_function("my_fct")
        inputs = [ storage_map[v] for v in node.inputs]
        outputs = [ storage_map[v] for v in node.outputs]
        def thunk():
            z = outputs[0]
            if z[0] is None or z[0].shape!=inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)),1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512,1,1), grid=grid)

        return thunk 
Example #8
Source File: decompress_cuda.py    From pycbc with GNU General Public License v3.0
def get_dckernel(slen):
    # Right now, hardcoding the number of threads per block
    nt = 1024
    nb = int(numpy.ceil(slen / 1024.0))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return dckernel_cache[nb]
    except KeyError:
        mod = SourceModule(kernel_sources.render(ntpb=nt, nblocks=nb))
        freq_tex = mod.get_texref("freq_tex")
        amp_tex = mod.get_texref("amp_tex")
        phase_tex = mod.get_texref("phase_tex")
        fn1 = mod.get_function("find_block_indices")
        fn1.prepare("PPifff", texrefs=[freq_tex])
        fn2 = mod.get_function("linear_interp")
        fn2.prepare("PfiffiPP", texrefs=[freq_tex, amp_tex, phase_tex])
        dckernel_cache[nb] = (fn1, fn2, freq_tex, amp_tex, phase_tex, nt, nb)
        return dckernel_cache[nb] 
Example #9
Source File: test_pycuda_theano_simple.py    From attention-lvcsr with MIT License
def test_pycuda_only():
    """Run pycuda only example to test that pycuda works."""
    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

    multiply_them = mod.get_function("multiply_them")

    # Test with pycuda in/out of numpy.ndarray
    a = numpy.random.randn(100).astype(numpy.float32)
    b = numpy.random.randn(100).astype(numpy.float32)
    dest = numpy.zeros_like(a)
    multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(100, 1, 1), grid=(1, 1))
    assert (dest == a * b).all() 
Example #10
Source File: binary.py    From neon with Apache License 2.0
def pack_rows():

    code = pack() + r"""
__global__ void pack_rows(float *a, unsigned int *b, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < size) {
        b[i] = pack(&a[i * 32]);
    }
}
"""

    module = SourceModule(code)
    kernel = module.get_function("pack_rows")
    sig = "2P I"
    kernel.prepare(sig)
    return kernel 
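pack_rows() returns a kernel whose argument layout has been fixed once with prepare("2P I"), i.e. two device pointers followed by an int. Since the call site is not shown, the self-contained sketch below demonstrates the same prepare()/prepared_call() pattern on a trivial scale kernel; the kernel, the buffer sizes, and the block/grid choice are illustrative assumptions, not code from neon.

import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

module = SourceModule("""
__global__ void scale(float *dst, float *src, int size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < size)
        dst[i] = 2.0f * src[i];
}
""")
kernel = module.get_function("scale")
kernel.prepare("2P I")                      # same struct-style signature as pack_rows: 2 pointers, 1 int

size = 1000
src_gpu = gpuarray.to_gpu(np.random.randn(size).astype(np.float32))
dst_gpu = gpuarray.empty_like(src_gpu)

block = (128, 1, 1)
grid = (int(np.ceil(size / 128.0)), 1)
# prepared_call takes the grid, the block, and then the raw argument values
kernel.prepared_call(grid, block, dst_gpu.gpudata, src_gpu.gpudata, size)
assert np.allclose(dst_gpu.get(), 2 * src_gpu.get())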
Example #11
Source File: binary.py    From neon with Apache License 2.0
def shift():

    code = shift_element() + r"""
__global__ void shift(
    float *a, float *b, float *c, bool value, int sizea, int b_rows, int b_cols)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < sizea) {
        float bi;
        if (b_rows > 1 && b_cols > 1) {
            bi = b[i];
        } else if (b_rows > 1 && b_cols == 1) {
            int step = sizea/b_rows;
            bi = b[i/step];
        } else if (b_rows == 1 && b_cols > 1) {
            bi = b[i % b_cols];
        } else if (b_rows == 1 && b_cols == 1) {
            bi = b[0];
        }

        c[i] = shift_element(a[i], bi, value);
    }
}
"""

    module = SourceModule(code)
    kernel = module.get_function("shift")
    sig = "3P 4I"
    kernel.prepare(sig)
    return kernel 
Example #12
Source File: monte_carlo_integrator.py    From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License
def __init__(self, math_function='y = sin(x)', precision='d', lo=0, hi=np.pi, samples_per_thread=10**5, num_blocks=100):
        
        self.math_function = math_function
        
        if precision in [None, 's', 'S', 'single', np.float32]:
            self.precision = 'float'
            self.numpy_precision = np.float32
            self.p_curand = ''
        elif precision in ['d','D', 'double', np.float64]:
            self.precision = 'double'
            self.numpy_precision = np.float64
            self.p_curand = '_double'
        else:
            raise Exception('precision is invalid datatype!')
            
        if (hi - lo <= 0):
            raise Exception('hi - lo <= 0!')
        else:
            self.hi = hi
            self.lo = lo
              
        MonteCarloDict = {'p' : self.precision, 'p_curand' : self.p_curand, 'math_function' : self.math_function}
        
        self.MonteCarloCode = MonteCarloKernelTemplate % MonteCarloDict
        
        self.ker = SourceModule(no_extern_c=True , options=['-w'], source=self.MonteCarloCode)
        
        self.f = self.ker.get_function('monte_carlo')
        
        self.num_blocks = num_blocks
        
        self.samples_per_thread = samples_per_thread 
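The constructor above compiles the rendered template with no_extern_c=True, presumably because the kernel pulls in the cuRAND device headers (the p_curand suffix selects the double-precision cuRAND calls). By default SourceModule wraps the whole source in extern "C" { ... }, which C++ headers cannot live inside, so with no_extern_c=True the kernel has to declare extern "C" itself to keep its name unmangled for get_function(). The snippet below is a hypothetical illustration of that point, not the book's MonteCarloKernelTemplate.

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

ker = SourceModule(no_extern_c=True, source="""
#include <curand_kernel.h>

extern "C" __global__ void state_size(int *out)
{
    out[0] = sizeof(curandState);   // touch a cuRAND type to prove the C++ header compiled
}
""")
state_size = ker.get_function("state_size")

out = np.zeros(1, dtype=np.int32)
state_size(drv.Out(out), block=(1, 1, 1), grid=(1, 1))
print("sizeof(curandState) =", out[0])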
Example #13
Source File: multi-kernel_multi-thread.py    From Hands-On-GPU-Programming-with-Python-and-CUDA with MIT License
def run(self):
        self.dev = drv.Device(0)
        self.context = self.dev.make_context()
        
        self.ker = SourceModule(kernel_code)

        self.mult_ker = self.ker.get_function('mult_ker')

        self.array_gpu = gpuarray.to_gpu(self.input_array)
        
        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64,1,1), grid=(1,1,1))

        self.output_array = self.array_gpu.get()
        
        self.context.pop() 
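This run() method is the body of a per-thread kernel launcher: each Python thread creates its own CUDA context, compiles the kernel inside that context, launches it, and pops the context before exiting. Below is a minimal sketch of the kind of threading.Thread wrapper it belongs to; kernel_code, array_len, and the class name KernelLauncherThread are assumptions based on the excerpt, not the book's exact code.

import threading
import numpy as np
import pycuda.driver as drv
from pycuda import gpuarray
from pycuda.compiler import SourceModule

array_len = 64
kernel_code = """
__global__ void mult_ker(float *array, int array_len)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < array_len)
        array[i] *= 2.0f;           // double each element in place
}
"""

class KernelLauncherThread(threading.Thread):
    def __init__(self, input_array):
        super().__init__()
        self.input_array = input_array
        self.output_array = None

    def run(self):
        # each thread owns its context for the duration of the launch, as in the excerpt above
        self.dev = drv.Device(0)
        self.context = self.dev.make_context()
        self.ker = SourceModule(kernel_code)
        self.mult_ker = self.ker.get_function('mult_ker')
        self.array_gpu = gpuarray.to_gpu(self.input_array)
        self.mult_ker(self.array_gpu, np.int32(array_len), block=(64, 1, 1), grid=(1, 1, 1))
        self.output_array = self.array_gpu.get()
        self.context.pop()

if __name__ == '__main__':
    drv.init()                      # initialize the driver once, before any thread opens a context
    threads = [KernelLauncherThread(np.random.randn(array_len).astype(np.float32))
               for _ in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    for t in threads:
        assert np.allclose(t.output_array, 2 * t.input_array)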
Example #14
Source File: binary.py    From neon with Apache License 2.0
def pack_cols():

    code = r"""
__global__ void pack_cols(float *a, unsigned int *b, int m, int n) {
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int i = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < n && 32 * i < m) {
        float num;
        unsigned int rvalue = 0;
        unsigned int sign;

        for(int k = 0; k < 32; k++) {
            num = a[j + n * (32 * i + k)];
            sign = (num >= 0);
            rvalue = rvalue | (sign << k);
        }
        b[j + n * i] = rvalue;
    }
}
"""

    module = SourceModule(code)
    kernel = module.get_function("pack_cols")
    sig = "2P 2I"
    kernel.prepare(sig)
    return kernel 
Example #15
Source File: PatchMatchCuda.py    From pyLucid with MIT License
def propagate(self, iters=2, rand_search_radius=500):
        """
        Optimize the NNF using PatchMatch Algorithm
        :param iters: number of iterations
        :param rand_search_radius: max radius to use in random search
        :return:
        """
        mod = SourceModule(open(os.path.join(package_directory,"patchmatch.cu")).read(),no_extern_c=True)
        patchmatch = mod.get_function("patch_match")
        
        rows = self.A.shape[0]
        cols = self.A.shape[1]
        channels = np.int32(self.A.shape[2])
        nnf_t = np.zeros(shape=(rows,cols),dtype=np.uint32)
        threads = 20
        
        def get_blocks_for_dim(dim, blocks):
            # dim // blocks + 1 blocks always cover dim (one spare block when dim divides evenly)
            return dim // blocks + 1
        patchmatch(
            drv.In(self.A),
            drv.In(self.AA),
            drv.In(self.B),
            drv.In(self.BB),
            drv.InOut(self.nnf),
            drv.InOut(nnf_t),
            drv.InOut(self.nnd),
            np.int32(rows),
            np.int32(cols),
            channels,
            np.int32(self.patch_size),
            np.int32(iters),
            np.int32(8),
            np.int32(rand_search_radius),
        block=(threads,threads,1),
        grid=(get_blocks_for_dim(rows,threads),
              get_blocks_for_dim(cols,threads))) 
Example #16
Source File: threshold_cuda.py    From pycbc with GNU General Public License v3.0
def get_tkernel(slen, window):
    if window < 32:
        raise ValueError("GPU threshold kernel does not support a window smaller than 32 samples")

    elif window <= 4096:
        nt = 128
    elif window <= 16384:
        nt = 256
    elif window <= 32768:
        nt = 512
    else:
        nt = 1024

    nb = int(numpy.ceil(slen / float(window)))

    if nb > 1024:
        raise ValueError("More than 1024 blocks not supported yet")

    try:
        return tfn_cache[(nt, nb)], nt, nb
    except KeyError:
        mod = SourceModule(tkernel1.render(chunk=nt))
        mod2 = SourceModule(tkernel2.render(blocks=nb))
        fn = mod.get_function("threshold_and_cluster")
        fn.prepare("PPPif")
        fn2 = mod2.get_function("threshold_and_cluster2")
        fn2.prepare("PPfi")
        tfn_cache[(nt, nb)] = (fn, fn2)
        return tfn_cache[(nt, nb)], nt, nb 
Example #17
Source File: chisq_cuda.py    From pycbc with GNU General Public License v3.0
def get_pchisq_fn(np, fuse_correlate=False):
    if np not in _pchisq_cache:
        nt = 256
        mod = SourceModule(chisqkernel.render(NT=nt, NP=np, fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "f" * np + "PPPI")
        else:
            fn.prepare("PPI" + "f" * np + "PPPI")
        _pchisq_cache[np] = (fn, nt)
    return _pchisq_cache[np] 
Example #18
Source File: chisq_cuda.py    From pycbc with GNU General Public License v3.0
def get_pchisq_fn_pow2(np, fuse_correlate=False):
    if np not in _pchisq_cache_pow2:
        nt = 256
        mod = SourceModule(chisqkernel_pow2.render(NT=nt, NP=np,
                                                   fuse=fuse_correlate))
        fn = mod.get_function("power_chisq_at_points_%s_pow2" % (np))
        if fuse_correlate:
            fn.prepare("PPPI" + "I" * np + "PPPI")
        else:
            fn.prepare("PPI" + "I" * np + "PPPI")
        _pchisq_cache_pow2[np] = (fn, nt)
    return _pchisq_cache_pow2[np] 
Example #19
Source File: cuda_codegen.py    From ProxImaL with MIT License
def compile_cuda_kernel(cuda_kernel_code):
    """
    compiles a cuda kernel and return compiled module
    """
    try:
        cuda_code = cuda_kernel_code if 1 else replace_local_floats_with_double(cuda_kernel_code)
        logging.debug("Compiling cuda code:\n" + cuda_code)
        mod = SourceModule(cuda_code, options=DEFAULT_NVCC_FLAGS + ['--use_fast_math'])
    except cuda.CompileError as e:
        logging.error(cuda_code)
        logging.error("CUDA compilation error:")
        logging.error(e.stderr)
        raise e
    return mod 
Example #20
Source File: pycuda_example.py    From attention-lvcsr with MIT License
def make_node(self, *inputs):
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')

        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        assert len(inputs) == 2  # TODO remove

        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        assert self.nout == 1

        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
        in_name = ["i" + str(id) for id in range(len(inputs))]
        out_name = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(out_node, "some_name",
                                       tuple([n + "[i]" for n in in_name]),
                                       tuple(n + "[i]" for n in out_name), {})
        c_code_param = ", ".join(
            [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
             for var, name in chain(izip(inputs, in_name),
                                    izip(out_node.outputs, out_name))] +
            ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
        self.pycuda_fct = mod.get_function(fct_name)
        return out_node 
Example #21
Source File: flex_conv.py    From ngraph-python with Apache License 2.0
def _get_shuffle_kernel(dtype):
    code = _shuffle_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("dimShuffle")
    kernel.prepare("PPIIIIIIIIIIIIII")
    return kernel 
Example #22
Source File: flex_conv.py    From ngraph-python with Apache License 2.0
def _get_transpose_kernel(dtype):

    code = _transpose_kernel % {
        "type": _get_register_type(dtype, memory=True)
    }
    module = SourceModule(code)
    kernel = module.get_function("transpose")
    kernel.prepare("PPII")
    return kernel 
Example #23
Source File: float_ew2.py    From ngraph-python with Apache License 2.0
def _prepare_compound_kernel(transformer, ops):
    """
    Generate and return a kernel given a set of ops.

    ops (list): List of tuples describing ops to execute in kernel. Each tuple
        should be of the format (op_name, input0, input1, output, axis)
    """
    # Take care tensor dimensionality
    ops = _wrap_tensor_descriptions(transformer, ops)

    # Generate kernel source code and block/grid mapping
    (axes_mapping, dims) = _get_axes_mapping(ops)
    code, kernel_name, arg_desc, params = _get_compound_kernel(ops, axes_mapping, dims)

    # Compile kernel
    if _are_flex_params(params):
        code = _includes_template + _flex_includes_template + code
    else:
        code = _includes_template + code

    module = SourceModule(code, options=[])
    kernel = module.get_function(kernel_name)
    kernel.name = kernel_name
    kernel.prepare(arg_desc)

    # Calculate block and grid dims
    blockdim = [1, 1, 1]
    griddim = [1, 1, 1]
    for axis in axes_mapping:
        if axis[0] == 'x':
            blockdim[0] = axis[1]
            griddim[0] = axis[2]
        elif axis[0] == 'y':
            blockdim[1] = axis[1]
            griddim[1] = axis[2]
        elif axis[0] == 'z':
            blockdim[2] = axis[1]
            griddim[2] = axis[2]

    params = [tuple(griddim), tuple(blockdim), None] + params
    return (kernel, params, 128) 
Example #24
Source File: pycuda_example.py    From D-VAE with MIT License
def make_node(self, *inputs):
        _inputs = [gpu_contiguous(as_cuda_ndarray_variable(i)) for i in inputs]
        if self.nin > 0 and len(_inputs) != self.nin:
            raise TypeError('Wrong argument count', (self.nin, len(_inputs)))
        for i in _inputs[1:]:
            if i.type.ndim != inputs[0].type.ndim:
                raise TypeError('different ranks among inputs')

        if any([any(i.type.broadcastable) for i in inputs]):
            raise Exception("pycuda don't support broadcasted dimensions")
        assert len(inputs) == 2  # TODO remove

        otype = CudaNdarrayType(broadcastable=[False] * _inputs[0].type.ndim)
        assert self.nout == 1

        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        out_node = Apply(self, _inputs, [otype() for o in xrange(self.nout)])
        in_name = ["i" + str(id) for id in range(len(inputs))]
        out_name = ["o" + str(id) for id in range(self.nout)]
        c_code = self.scalar_op.c_code(out_node, "some_name",
                                       tuple([n + "[i]" for n in in_name]),
                                       tuple(n + "[i]" for n in out_name), {})
        c_code_param = ", ".join(
            [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
             for var, name in chain(izip(inputs, in_name),
                                    izip(out_node.outputs, out_name))] +
            ["int size"])
        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
        self.pycuda_fct = mod.get_function(fct_name)
        return out_node 
Example #25
Source File: m_div_eigenenergy_cuda.py    From pyscf with Apache License 2.0
def div_eigenenergy_cuda(ksn2e, ksn2f, nfermi, vstart, comega, nm2v_re, nm2v_im,
        block_size, grid_size):

    block = (int(block_size[0]), int(block_size[1]), int(1))
    grid = (int(grid_size[0]), int(grid_size[1]))

    mod = SourceModule(kernel_code_div_eigenenergy_cuda)
    calc_XXVV = mod.get_function("calc_XXVV_gpu")
    calc_XXVV(nm2v_re, nm2v_im, np.int32(nm2v_re.shape[0]),
        np.int32(nm2v_re.shape[1]), ksn2e, ksn2f, np.int32(nfermi),
        np.int32(vstart), np.int32(ksn2e.shape[0]), np.float64(comega.real),
        np.float64(comega.imag), block = block, grid = grid) 
Example #26
Source File: dihedral_ops.py    From kaggle-ndsb with MIT License
def make_thunk(self, node, storage_map, _, _2):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        mod = SourceModule("""
            __global__ void cyclic_roll(float * input, float * output, int batch_size, int num_features) {
                int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
                int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

                int height = 4 * batch_size;
                int width = 4 * num_features;

                if (x < num_features && y < height) {
                    for (int i = 0; i < 4; i++) {
                        int y_out = (y + batch_size * (4 - i)) % height;
                        int x_out = x + num_features * i;

                        output[y_out * width + x_out] = input[y * num_features + x];
                    }
                }
            }""")
        kernel = mod.get_function("cyclic_roll")

        def thunk():
            in_shape = inputs[0][0].shape
            rows, cols = in_shape

            assert rows % 4 == 0

            out_shape = (rows, 4 * cols)
            
            batch_size = rows // 4
            num_features = cols

            out = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if out[0] is None or out[0].shape != out_shape:
                out[0] = cuda.CudaNdarray.zeros(out_shape)

            x_block = 16
            y_block = 16
            block = (x_block, y_block, 1)

            x_grid = int(np.ceil(float(in_shape[1]) / x_block))
            y_grid = int(np.ceil(float(in_shape[0]) / y_block))
            grid = (x_grid, y_grid, 1)

            kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features), block=block, grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk 
Example #27
Source File: dihedral_ops.py    From kaggle-ndsb with MIT License
def make_thunk(self, node, storage_map, _, _2):
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        mod = SourceModule("""
            __global__ void cyclic_roll_grad(float * input, float * output, int batch_size, int num_features) {
                int x = blockIdx.x*blockDim.x + threadIdx.x; // feature dim, fastest varying index!
                int y = blockIdx.y*blockDim.y + threadIdx.y; // batch dim

                int height = 4 * batch_size;
                int width = 4 * num_features;

                float val = 0;

                if (x < num_features && y < height) {
                    for (int i = 0; i < 4; i++) {
                        int y_in = (y + batch_size * (4 - i)) % height;
                        int x_in = x + num_features * i;

                        val += input[y_in * width + x_in];
                    }

                    output[y * num_features + x] = val;
                }
            }""")
        kernel = mod.get_function("cyclic_roll_grad")

        def thunk():
            in_shape = inputs[0][0].shape
            rows, cols = in_shape
            
            assert rows % 4 == 0
            assert cols % 4 == 0

            out_shape = (rows, cols // 4)
            
            batch_size = rows // 4
            num_features = cols // 4

            out = outputs[0]

            # only allocate if there is no previous allocation of the right size.
            if out[0] is None or out[0].shape != out_shape:
                out[0] = cuda.CudaNdarray.zeros(out_shape)

            x_block = 16
            y_block = 16
            block = (x_block, y_block, 1)

            x_grid = int(np.ceil(float(out_shape[1]) / x_block))
            y_grid = int(np.ceil(float(out_shape[0]) / y_block))
            grid = (x_grid, y_grid, 1)

            kernel(inputs[0][0], out[0], np.intc(batch_size), np.intc(num_features), block=block, grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk 
Example #28
Source File: pycuda_example.py    From attention-lvcsr with MIT License
def make_thunk(self, node, storage_map, _, _2):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        in_name = ["i" + str(id) for id in range(len(node.inputs))]
        out_name = ["o" + str(id) for id in range(self.nout)]

        c_code = self.scalar_op.c_code(node, "some_name",
                                       tuple([n + "[i]" for n in in_name]),
                                       tuple(n + "[i]" for n in out_name), {})
        c_code_param = ", ".join(
            [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
             for var, name in chain(izip(node.inputs, in_name),
                                    izip(node.outputs, out_name))] +
            ["int size"])

        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
        pycuda_fct = mod.get_function(fct_name)
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            if (z[0] is None or
                    z[0].shape != inputs[0][0].shape or
                    not z[0].is_c_contiguous()):
                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
                    inputs[0][0].shape)
            if inputs[0][0].shape != inputs[1][0].shape:
                raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                                " inputs don't have the same shape!")

            if inputs[0][0].size > 512:
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size), block=block,
                       grid=grid)
        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk 
Example #29
Source File: flex_conv.py    From ngraph-python with Apache License 2.0
def _get_convert_kernel(dtype):

    _convert_kernel = r"""
#include <cuda_fp16.h>

__device__ short iabs(short a)
{
    return (a < 0) ? (-a) : a;
}

__global__ void convert(short* out, const %(type)s* in, int dim,
                        float scale, int* flex_data)
{
    int offset = blockIdx.x * dim;
    int max_val = 0;

    for(int item = threadIdx.x; item < dim; item += 32)
    {
        %(type)s value = in[offset + item];
        short result = (short)(%(cvt)s(value) * scale);
        max_val = max((int)iabs(result), max_val);
        out[offset + item] = result;
    }

    atomicMax(flex_data, max_val);
}
"""
    if dtype == "f4":
        template_vals = {
            "type": "float",
            "cvt": "",
        }
    elif dtype == "f2":
        template_vals = {
            "type": "unsigned short",
            "cvt": "__half2float"
        }
    else:
        raise ValueError("Invalid conversion type")

    code = _convert_kernel % template_vals
    module = SourceModule(code)
    kernel = module.get_function("convert")
    kernel.prepare("PPIfP")
    return kernel 
Example #30
Source File: pycuda_example.py    From D-VAE with MIT License
def make_thunk(self, node, storage_map, _, _2):
        # TODO support broadcast!
        # TODO assert all input have the same shape
        fct_name = "pycuda_elemwise_%s" % str(self.scalar_op)
        in_name = ["i" + str(id) for id in range(len(node.inputs))]
        out_name = ["o" + str(id) for id in range(self.nout)]

        c_code = self.scalar_op.c_code(node, "some_name",
                                       tuple([n + "[i]" for n in in_name]),
                                       tuple(n + "[i]" for n in out_name), {})
        c_code_param = ", ".join(
            [_replace_npy_types(var.type.dtype_specs()[1]) + " *" + name
             for var, name in chain(izip(node.inputs, in_name),
                                    izip(node.outputs, out_name))] +
            ["int size"])

        mod = SourceModule("""
  __global__ void %s(%s)
  {
    int i = (blockIdx.x+blockIdx.y*gridDim.x)*(blockDim.x*blockDim.y);
    i += threadIdx.x + threadIdx.y*blockDim.x;
    if(i<size){
        %s
    }
  }
  """ % (fct_name, c_code_param, c_code))
        pycuda_fct = mod.get_function(fct_name)
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            if (z[0] is None or
                    z[0].shape != inputs[0][0].shape or
                    not z[0].is_c_contiguous()):
                z[0] = theano.sandbox.cuda.CudaNdarray.zeros(
                    inputs[0][0].shape)
            if inputs[0][0].shape != inputs[1][0].shape:
                raise TypeError("PycudaElemwiseSourceModuleMakeThunkOp:"
                                " inputs don't have the same shape!")

            if inputs[0][0].size > 512:
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size), block=block,
                       grid=grid)
        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk