jcuda.driver.CUfunction Java Examples

Source File: JCudaDriverSimpleJOGL.java From jcuda-samples with MIT License

6 votes

/**
 * Prepares the JCudaDriver for this sample: creates a context on the
 * device with ordinal 0, loads the kernel module and looks up the
 * kernel function. Note that this has to be called from the same 
 * thread that will later use the JCudaDriver API.
 */
private void initJCuda()
{
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver API and open a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext deviceContext = new CUcontext();
    cuCtxCreate(deviceContext, 0, firstDevice);

    // Obtain the name of the PTX file for the kernel source file
    String kernelSourceName = 
        "src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu";
    String kernelPtxName = 
        JCudaSamplesUtils.preparePtxFile(kernelSourceName);

    // Load that PTX file as a module
    CUmodule kernelModule = new CUmodule();
    cuModuleLoad(kernelModule, kernelPtxName);

    // Look up the kernel in the module and keep it in the 'function'
    // field. It will later be called during the animation, in the 
    // display method of this GLEventListener.
    function = new CUfunction();
    cuModuleGetFunction(function, kernelModule, "simple_vbo_kernel");
}

Source File: JCudaDriverBasicGraphExample.java From jcuda-samples with MIT License

6 votes

/**
 * Create a CUDA kernel function by compiling the given code using the
 * NVRTC, and obtaining the function with the given name. If the
 * compilation produces a non-empty log, the log is printed to the
 * standard error stream.
 * 
 * @param name The name of the function
 * @param code The source code
 * @return The CUDA function
 */
private static CUfunction createFunction(String name, String code)
{
    nvrtcProgram program = new nvrtcProgram();
    nvrtcCreateProgram(program, code, null, 0, null, null);

    String[] ptx = new String[1];
    try
    {
        nvrtcCompileProgram(program, 0, null);

        String[] programLog = new String[1];
        nvrtcGetProgramLog(program, programLog);
        String log = programLog[0].trim();
        if (!log.isEmpty())
        {
            System.err.println("Compilation log for " + name + ":\n" + log);
        }

        nvrtcGetPTX(program, ptx);
    }
    finally
    {
        // Destroy the NVRTC program on every path, so that it is also
        // released when one of the calls above throws (which they may
        // do when exceptions are enabled for NVRTC)
        nvrtcDestroyProgram(program);
    }

    // Load the generated PTX as a module and look up the function
    CUmodule module = new CUmodule();
    cuModuleLoadData(module, ptx[0]);
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, name);

    return function;
}

Source File: JCudaDriverSimpleLWJGL.java From jcuda-samples with MIT License

6 votes

/**
 * Sets up everything this sample needs from the JCudaDriver: a context
 * on the device with ordinal 0, the module with the kernel, and the 
 * kernel function itself. Note that this has to be called from the 
 * same thread that will later use the JCudaDriver API.
 */
private void initJCuda()
{
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize the driver API, then create the context for device 0
    cuInit(0);
    CUdevice gpu = new CUdevice();
    cuDeviceGet(gpu, 0);
    CUcontext gpuContext = new CUcontext();
    cuCtxCreate(gpuContext, 0, gpu);

    // Obtain the name of the PTX file for the kernel source file
    String cuFileName = 
        "src/main/resources/kernels/JCudaDriverSimpleGLKernel.cu";
    String ptxName = JCudaSamplesUtils.preparePtxFile(cuFileName);

    // Load the module from the PTX file
    CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxName);

    // Store the kernel function in the 'function' field. It will 
    // later be called during the animation, in the display method 
    // of this GLEventListener.
    function = new CUfunction();
    cuModuleGetFunction(function, ptxModule, "simple_vbo_kernel");
}

Source File: CUDAInnerLoop.java From ocular with GNU General Public License v3.0

6 votes

/**
 * Copies the zero-extended white and black observations to the device,
 * launches one "compute_emissions_&lt;width&gt;" kernel for every template
 * width that has at least one template index, and finally copies the
 * device scores back into {@code scores}.
 *
 * @param scores host array that receives sequenceLength*totalTemplateNumIndices floats
 * @param whiteObservations host data that is zero-extended and copied to d_Ow
 * @param blackObservations host data that is zero-extended and copied to d_Ob
 * @param sequenceLength sequence length; determines the grid size in x and the score count
 */
public void compute(final float[] scores, final float[] whiteObservations, final float[] blackObservations, final int sequenceLength) {
	// The sequence length is rounded up to a whole number of blocks in x
	final int columnsPerBlock = BLOCK_SIZE_X*ROLL_X;
	final int gridSizeX = (int) Math.ceil(((double) sequenceLength) / columnsPerBlock);
	final int extendedSeqLength = gridSizeX * columnsPerBlock;

	// Number of floats in each extended observation buffer
	final int paddedFloats = (extendedSeqLength+maxTemplateWidth-1)*CharacterTemplate.LINE_HEIGHT;
	cuMemcpyHtoD(d_Ow, Pointer.to(CudaUtil.extendWithZeros(whiteObservations, paddedFloats)), paddedFloats * Sizeof.FLOAT);
	cuMemcpyHtoD(d_Ob, Pointer.to(CudaUtil.extendWithZeros(blackObservations, paddedFloats)), paddedFloats * Sizeof.FLOAT);

	for (int tw=minTemplateWidth; tw<=maxTemplateWidth; ++tw) {
		// Index of this template width in the per-width arrays
		final int slot = tw-minTemplateWidth;
		if (templateNumIndices[slot] <= 0) {
			continue;
		}

		CUfunction kernel = new CUfunction();
		cuModuleGetFunction(kernel, cudaModule, "compute_emissions_"+tw);
		JCudaDriver.cuFuncSetCacheConfig(kernel, CUfunc_cache.CU_FUNC_CACHE_PREFER_SHARED);
		JCudaDriver.cuFuncSetSharedMemConfig(kernel, CUsharedconfig.CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE);

		Pointer kernelArguments = Pointer.to(
				Pointer.to(new int[] {templateIndicesOffsets[slot]*sequenceLength}),
				Pointer.to(new int[] {sequenceLength}),
				Pointer.to(new int[] {templateNumIndices[slot]}),
				Pointer.to(d_Tw[slot]),
				Pointer.to(d_Tb[slot]),
				Pointer.to(d_Ow),
				Pointer.to(d_Ob),
				Pointer.to(d_scores));
		int gridSizeY = (int) Math.ceil(((double) templateNumIndices[slot]) / BLOCK_SIZE_Y);
		cuLaunchKernel(kernel,
				gridSizeX, gridSizeY, 1,        // Grid dimension
				BLOCK_SIZE_X, BLOCK_SIZE_Y, 1,  // Block dimension
				0, null,                        // Shared memory size and stream
				kernelArguments, null           // Kernel- and extra parameters
				);
	}

	cuMemcpyDtoH(Pointer.to(scores), d_scores, sequenceLength*totalTemplateNumIndices * Sizeof.FLOAT);
}

Source File: JCudaAbstractKernelTest.java From jcuda with MIT License

6 votes

/**
 * Creates a context for the first device, obtains the PTX file for the
 * specified .CU file, loads this PTX file as a module, and returns the 
 * function with the specified name from this module.
 * 
 * @param cuFileName The .CU file name
 * @param functionName The kernel function name
 * @return The function
 * @throws CudaException If an error occurs
 */
protected final CUfunction initialize(
    String cuFileName, String functionName)
{
    // With exceptions enabled, the calls below are not checked 
    // individually for errors
    JCudaDriver.setExceptionsEnabled(true);

    // Bring up the driver API and open a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    CUcontext deviceContext = new CUcontext();
    cuCtxCreate(deviceContext, 0, firstDevice);

    // Obtain the PTX file for the .CU file and load it as a module
    String ptxName = JCudaTestUtils.preparePtxFile(cuFileName);
    CUmodule ptxModule = new CUmodule();
    cuModuleLoad(ptxModule, ptxName);

    // Look up the requested kernel function in the module
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, ptxModule, functionName);
    return kernel;
}

Source File: Context.java From OSPREY3 with GNU General Public License v2.0

5 votes

/**
 * Launches the given kernel function with a grid and blocks that only
 * extend in the x dimension.
 *
 * @param func the kernel function to launch
 * @param gridBlocks grid size in x (the y and z sizes are 1)
 * @param blockThreads block size in x (the y and z sizes are 1)
 * @param sharedMemBytes shared memory size passed to the launch
 * @param pArgs kernel parameters passed to the launch
 * @param stream stream whose {@code getStream()} result the launch is issued on
 */
public void launchKernel(CUfunction func, int gridBlocks, int blockThreads, int sharedMemBytes, Pointer pArgs, GpuStream stream) {
	JCudaDriver.cuLaunchKernel(func,
		gridBlocks, 1, 1,                    // grid dimension
		blockThreads, 1, 1,                  // block dimension
		sharedMemBytes, stream.getStream(),  // shared memory size and stream
		pArgs, null);                        // kernel- and extra parameters
}

Source File: Kernel.java From OSPREY3 with GNU General Public License v2.0

5 votes

/**
 * Creates a function with initial launch settings (no arguments, one
 * block, one thread per block, no shared memory calculation) and looks
 * up the kernel function with the given name in the module.
 *
 * @param name name of the kernel function to obtain from the module
 */
public Function(String name) {
	// initial launch configuration
	pArgs = null;
	numBlocks = 1;
	blockThreads = 1;
	sharedMemCalc = new SharedMemCalculator.None();

	// kernel lookup
	func = new CUfunction();
	JCudaDriver.cuModuleGetFunction(func, module, name);
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorSqrt" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void sqrt(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSqrt");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorSqr" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void sqr(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSqr");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorPow" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 * @param val scalar passed to the kernel as its third parameter
 */
private static void pow(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorPow");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new float[] {val}), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorMin" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 * @param val scalar passed to the kernel as its third parameter
 */
private static void min(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMin");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new float[] {val}), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorMax" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 * @param val scalar passed to the kernel as its third parameter
 */
private static void max(Matrix A, Matrix B, float val) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMax");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new float[] {val}), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorMul" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (its size is not checked here)
 * @param C matrix whose device data is the third kernel parameter (presumably the output; its size is not checked here)
 */
private static void mul(Matrix A, Matrix B, Matrix C) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorMul");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(C.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorAbs" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void abs(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorAbs");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: JCudaReduction.java From jcuda-samples with MIT License

5 votes

/**
 * Sets up the static state of this sample: the context for the first
 * device, the module with the reduction kernel, the "reduce" function
 * and a temporary device buffer.
 */
private static void init()
{
    // Bring up the driver API and open a context on device 0
    cuInit(0);
    CUdevice firstDevice = new CUdevice();
    cuDeviceGet(firstDevice, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, firstDevice);

    // Obtain the name of the PTX file for the kernel source file 
    String kernelSourceName = 
        "src/main/resources/kernels/JCudaReductionKernel.cu";
    String kernelPtxName = 
        JCudaSamplesUtils.preparePtxFile(kernelSourceName);

    // Load that PTX file as the module
    module = new CUmodule();
    cuModuleLoad(module, kernelPtxName);

    // Look up the "reduce" function in the module
    function = new CUfunction();
    cuModuleGetFunction(function, module, "reduce");

    // Temporary device memory for 1024 floats (this must be at 
    // least numberOfBlocks * Sizeof.FLOAT bytes)
    deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT);
}

Source File: JCudaDriverStreamCallbacks.java From jcuda-samples with MIT License

5 votes

/**
 * Initialize the driver API, the {@link #context} and the 
 * kernel {@link #function}. The kernel is compiled from the program 
 * source code with the NVRTC and loaded as a module.
 */
private static void initialize()
{
    System.out.println("Initializing...");
    
    JCudaDriver.setExceptionsEnabled(true);
    JNvrtc.setExceptionsEnabled(true);

    // Create a context for the first device
    cuInit(0);
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    context = new CUcontext();
    cuCtxCreate(context, 0, device);

    // Compile the program source code to PTX
    nvrtcProgram program = new nvrtcProgram();
    nvrtcCreateProgram(
        program, programSourceCode, null, 0, null, null);
    String[] ptx = new String[1];
    try
    {
        nvrtcCompileProgram(program, 0, null);
        nvrtcGetPTX(program, ptx);
    }
    finally
    {
        // Exceptions are enabled for the NVRTC above, so a failing
        // compilation throws. Destroy the program on that path as well.
        nvrtcDestroyProgram(program);
    }

    // Load the PTX as a module and obtain the kernel function
    CUmodule module = new CUmodule();
    cuModuleLoadData(module, ptx[0]);

    function = new CUfunction();
    cuModuleGetFunction(function, module, "example");
    
    System.out.println("Initializing DONE");
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorScalarSet" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrix. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param alpha scalar passed to the kernel as its second parameter
 */
private static void scalarSet(Matrix A, float alpha) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorScalarSet");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(new float[] {alpha}), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorScalarAdd" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param alpha scalar passed to the kernel as its third parameter
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void scalarAdd(Matrix A, float alpha, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorScalarAdd");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new float[] {alpha}), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorLog" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void log(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorLog");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorExp" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void exp(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorExp");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorSign" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (presumably the output; its size is not checked here)
 */
private static void sign(Matrix A, Matrix B) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorSign");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: CublasUtil.java From murphy with Apache License 2.0

5 votes

/**
 * Launches the "vectorDiv" kernel of the helper module for the
 * {@code A.rows*A.cols} elements of the given matrices. Does nothing
 * when that element count is zero.
 *
 * @param A matrix whose device data is the first kernel parameter; its size determines the element count
 * @param B matrix whose device data is the second kernel parameter (its size is not checked here)
 * @param C matrix whose device data is the third kernel parameter (presumably the output; its size is not checked here)
 */
private static void div(Matrix A, Matrix B, Matrix C) {
	int n = A.rows*A.cols;
	if (n == 0) {
		// Without this guard blockSize would be 0 and the grid size would be
		// derived from 0.0/0 (NaN, cast to 0), i.e. a launch with zero-sized
		// grid and block dimensions
		return;
	}
	CUfunction function = new CUfunction();
	cuModuleGetFunction(function, helperModule, "vectorDiv");
	Pointer kernelParameters = Pointer.to(Pointer.to(A.data_d), Pointer.to(B.data_d), Pointer.to(C.data_d), Pointer.to(new int[] {n}));
	// One thread per element: blocks of at most BLOCK_SIZE threads, enough blocks to cover n
	int blockSize = Math.min(n, BLOCK_SIZE);
	int gridSizeX = (int) Math.ceil((double) n / blockSize);
	cuLaunchKernel(function,
			gridSizeX, 1, 1,      // Grid dimension
			blockSize, 1, 1,      // Block dimension
			0, null,               // Shared memory size and stream
			kernelParameters, null // Kernel- and extra parameters
			);
	if (DEBUG_SYNC) JCudaDriver.cuCtxSynchronize();
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 1D float texture access: the host input is copied into a
 * device array that is bound to the "texture_float_1D" texture 
 * reference, the "test_float_1D" kernel is launched with a single
 * thread, and its output is compared with the expected value.
 * 
 * @return Whether the kernel output equals the expected value
 */
private boolean test_float_1D()
{
    // Describe and create the device array behind the texture
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Upload the host input into the array
    cuMemcpyHtoA(textureArray, 0, 
        Pointer.to(input_float_1D), sizeX * Sizeof.FLOAT);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single float written by the kernel
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 1);

    // Look up the kernel and assemble its parameters
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_1D");
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX })
    );

    // Launch one block with one thread and wait for it to finish
    cuLaunchKernel(kernel, 
        1, 1, 1, 
        1, 1, 1, 
        0, null, 
        kernelArguments, null);
    cuCtxSynchronize();

    // Copy the result back to the host
    float[] hostResult = new float[1];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 1);

    // Report and evaluate the result
    log("Result float  1D " + Arrays.toString(hostResult));
    boolean passed = Arrays.equals(hostResult, new float[]{ 0.5f });
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float  1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 2D float texture access: the host input is copied into a
 * device array that is bound to the "texture_float_2D" texture 
 * reference, the "test_float_2D" kernel is launched with a single
 * thread, and its output is compared with the expected value.
 * 
 * @return Whether the kernel output equals the expected value
 */
private boolean test_float_2D()
{
    // Describe and create the device array behind the texture
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Upload the host input into the array with a 2D copy
    CUDA_MEMCPY2D upload = new CUDA_MEMCPY2D();
    upload.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    upload.srcHost = Pointer.to(input_float_2D);
    upload.srcPitch = sizeX * Sizeof.FLOAT;
    upload.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    upload.dstArray = textureArray;
    upload.WidthInBytes = sizeX * Sizeof.FLOAT;
    upload.Height = sizeY;
    cuMemcpy2D(upload);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single float written by the kernel
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 1);

    // Look up the kernel and assemble its parameters
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_2D");
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY })
    );

    // Launch one block with one thread and wait for it to finish
    cuLaunchKernel(kernel, 
        1, 1, 1, 
        1, 1, 1, 
        0, null, 
        kernelArguments, null);
    cuCtxSynchronize();

    // Copy the result back to the host
    float[] hostResult = new float[1];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 1);

    // Report and evaluate the result
    log("Result float  2D " + Arrays.toString(hostResult));
    boolean passed = Arrays.equals(hostResult, new float[]{ 1.5f });
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float  2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 3D float texture access: the host input is copied into a
 * 3D device array that is bound to the "texture_float_3D" texture 
 * reference, the "test_float_3D" kernel is launched with a single
 * thread, and its output is compared with the expected value.
 * 
 * @return Whether the kernel output equals the expected value
 */
private boolean test_float_3D()
{
    // Describe and create the 3D device array behind the texture
    CUDA_ARRAY3D_DESCRIPTOR descriptor = new CUDA_ARRAY3D_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.Depth = sizeZ;
    descriptor.NumChannels = 1;
    CUarray textureArray = new CUarray();
    cuArray3DCreate(textureArray, descriptor);

    // Upload the host input into the array with a 3D copy
    CUDA_MEMCPY3D upload = new CUDA_MEMCPY3D();
    upload.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    upload.srcHost = Pointer.to(input_float_3D);
    upload.srcPitch = sizeX * Sizeof.FLOAT;
    upload.srcHeight = sizeY;
    upload.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    upload.dstArray = textureArray;
    // NOTE(review): dstHeight is set to sizeX while srcHeight is sizeY.
    // Presumably this field is not used for an array destination - 
    // confirm against the cuMemcpy3D documentation.
    upload.dstHeight = sizeX;
    upload.WidthInBytes = sizeX * Sizeof.FLOAT;
    upload.Height = sizeY;
    upload.Depth = sizeZ;
    cuMemcpy3D(upload);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float_3D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 2, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the single float written by the kernel
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 1);

    // Look up the kernel and assemble its parameters
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float_3D");
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY }),
        Pointer.to(new float[]{ posZ })
    );

    // Launch one block with one thread and wait for it to finish
    cuLaunchKernel(kernel, 
        1, 1, 1, 
        1, 1, 1, 
        0, null, 
        kernelArguments, null);
    cuCtxSynchronize();

    // Copy the result back to the host
    float[] hostResult = new float[1];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 1);

    // Report and evaluate the result
    log("Result float  3D " + Arrays.toString(hostResult));
    boolean passed = Arrays.equals(hostResult, new float[]{ 3.5f });
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float  3D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 1D float4 texture access: the host input is copied into a
 * four-channel device array that is bound to the "texture_float4_1D" 
 * texture reference, the "test_float4_1D" kernel is launched with a 
 * single thread, and its output is compared with the expected values.
 * 
 * @return Whether the kernel output equals the expected values
 */
private boolean test_float4_1D()
{
    // Describe and create the four-channel device array
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = 1;
    descriptor.NumChannels = 4;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Upload the host input into the array
    cuMemcpyHtoA(textureArray, 0, 
        Pointer.to(input_float4_1D), sizeX * Sizeof.FLOAT * 4);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_1D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four floats written by the kernel
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 4);

    // Look up the kernel and assemble its parameters
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_1D");
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX })
    );

    // Launch one block with one thread and wait for it to finish
    cuLaunchKernel(kernel, 
        1, 1, 1, 
        1, 1, 1, 
        0, null, 
        kernelArguments, null);
    cuCtxSynchronize();

    // Copy the result back to the host
    float[] hostResult = new float[4];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 4);

    // Report and evaluate the result
    log("Result float4 1D " + Arrays.toString(hostResult));
    boolean passed = Arrays.equals(
        hostResult, new float[]{ 0.5f, 0.5f, 0.5f, 0.5f });
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float4 1D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 2D float4 texture access: the host input is copied into a
 * four-channel device array that is bound to the "texture_float4_2D" 
 * texture reference, the "test_float4_2D" kernel is launched with a 
 * single thread, and its output is compared with the expected values.
 * 
 * @return Whether the kernel output equals the expected values
 */
private boolean test_float4_2D()
{
    // Describe and create the four-channel device array
    CUDA_ARRAY_DESCRIPTOR descriptor = new CUDA_ARRAY_DESCRIPTOR();
    descriptor.Format = CU_AD_FORMAT_FLOAT;
    descriptor.Width = sizeX;
    descriptor.Height = sizeY;
    descriptor.NumChannels = 4;
    CUarray textureArray = new CUarray();
    cuArrayCreate(textureArray, descriptor);

    // Upload the host input into the array with a 2D copy
    CUDA_MEMCPY2D upload = new CUDA_MEMCPY2D();
    upload.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    upload.srcHost = Pointer.to(input_float4_2D);
    upload.srcPitch = sizeX * Sizeof.FLOAT * 4;
    upload.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    upload.dstArray = textureArray;
    upload.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    upload.Height = sizeY;
    cuMemcpy2D(upload);

    // Configure the texture reference and attach the array to it
    CUtexref textureRef = new CUtexref();
    cuModuleGetTexRef(textureRef, module, "texture_float4_2D");
    cuTexRefSetFilterMode(textureRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(textureRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(textureRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(textureRef, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(textureRef, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(textureRef, textureArray, CU_TRSA_OVERRIDE_FORMAT);

    // Device memory for the four floats written by the kernel
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, Sizeof.FLOAT * 4);

    // Look up the kernel and assemble its parameters
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "test_float4_2D");
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceResult),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY })
    );

    // Launch one block with one thread and wait for it to finish
    cuLaunchKernel(kernel, 
        1, 1, 1, 
        1, 1, 1, 
        0, null, 
        kernelArguments, null);
    cuCtxSynchronize();

    // Copy the result back to the host
    float[] hostResult = new float[4];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, Sizeof.FLOAT * 4);

    // Report and evaluate the result
    log("Result float4 2D " + Arrays.toString(hostResult));
    boolean passed = Arrays.equals(
        hostResult, new float[]{ 1.5f, 1.5f, 1.5f, 1.5f });
    String verdict = passed ? "PASSED" : "FAILED";
    log("Test   float4 2D " + verdict);

    // Release the device resources
    cuArrayDestroy(textureArray);
    cuMemFree(deviceResult);

    return passed;
}

Source File: JCudaDriverTextureTest.java From jcuda with MIT License

4 votes

/**
 * Test the 3D float4 texture access.
 * 
 * Uploads the host data <code>input_float4_3D</code> into a 3D CUDA 
 * array with four float channels, binds this array to the texture 
 * reference <code>texture_float4_3D</code> of the module, and launches 
 * the <code>test_float4_3D</code> kernel with a single thread, passing
 * the output pointer and the lookup position (posX, posY, posZ).
 * 
 * @return Whether the four floats that have been written by the kernel
 * are exactly equal to the expected values
 */
private boolean test_float4_3D()
{
    // Create the array on the device
    CUarray array = new CUarray();
    CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
    ad.Format = CU_AD_FORMAT_FLOAT;
    ad.Width = sizeX;
    ad.Height = sizeY;
    ad.Depth = sizeZ;
    ad.NumChannels = 4;
    cuArray3DCreate(array, ad);

    // Copy the host input to the array. One row consists of 
    // sizeX elements with 4 floats each.
    CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
    copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    copy.srcHost = Pointer.to(input_float4_3D);
    copy.srcPitch = sizeX * Sizeof.FLOAT * 4;
    copy.srcHeight = sizeY;
    copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
    copy.dstArray = array;
    // The height is the number of rows, i.e. sizeY (this was sizeX 
    // before, which did not match srcHeight and Height). 
    // NOTE(review): The CUDA documentation states that dstHeight is 
    // ignored for array destinations - so this is for consistency only.
    copy.dstHeight = sizeY;
    copy.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
    copy.Height = sizeY;
    copy.Depth = sizeZ;
    cuMemcpy3D(copy);

    // Set up the texture reference: Linear filtering, clamping in 
    // all three dimensions, and normalized coordinates
    CUtexref texref = new CUtexref();
    cuModuleGetTexRef(texref, module, "texture_float4_3D");
    cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
    cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
    cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

    // Prepare the output device memory: A single float4 
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, Sizeof.FLOAT * 4);

    // Obtain the test function
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "test_float4_3D");

    // Set up the kernel parameters: The output pointer, followed
    // by the position where the texture should be read
    Pointer kernelParameters = Pointer.to(
        Pointer.to(dOutput),
        Pointer.to(new float[]{ posX }),
        Pointer.to(new float[]{ posY }),
        Pointer.to(new float[]{ posZ })
    );

    // Call the kernel function with one block that contains one thread
    cuLaunchKernel(function, 1, 1, 1, 
        1, 1, 1, 0, null, kernelParameters, null);
    cuCtxSynchronize();

    // Obtain the output on the host
    float hOutput[] = new float[4];
    cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);

    // Print the results. The comparison is an exact one.
    log("Result float4 3D " + Arrays.toString(hOutput));
    float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f };
    boolean passed = Arrays.equals(hOutput, expected);
    log("Test   float4 3D " + (passed ? "PASSED" : "FAILED"));

    // Clean up
    cuArrayDestroy(array);
    cuMemFree(dOutput);

    return passed;
}

Source File: JCudaConstantMemoryExample.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the sample: Fills the <code>constantMemoryData</code>
 * symbol of the kernel module with consecutive float values, runs the 
 * <code>constantMemoryKernel</code> on a device buffer of the same size, 
 * and prints whether the buffer contents that are read back are equal 
 * to the values that have been written to the symbol.
 * 
 * @param args Not used
 * @throws IOException Declared by the signature of this method 
 */
public static void main(String[] args) throws IOException 
{
    // With exceptions enabled, failing driver calls will throw, 
    // so that no return codes have to be checked below
    JCudaDriver.setExceptionsEnabled(true);

    // Set up the driver API and a context for the device with index 0
    cuInit(0);
    CUcontext ctx = new CUcontext();
    CUdevice dev = new CUdevice();
    cuDeviceGet(dev, 0);
    cuCtxCreate(ctx, 0, dev);

    // Let the utility class prepare the PTX file for the kernel 
    // source, and load the result as a module
    CUmodule module = new CUmodule();
    cuModuleLoad(module, JCudaSamplesUtils.preparePtxFile(
        "src/main/resources/kernels/JCudaConstantMemoryKernel.cu"));

    // Look up the "constantMemoryData" symbol. The driver reports
    // its address and its size (via a one-element array)
    long symbolSize[] = new long[1];
    CUdeviceptr symbolPointer = new CUdeviceptr();
    cuModuleGetGlobal(symbolPointer, symbolSize, 
        module, "constantMemoryData");
    int byteCount = (int)symbolSize[0];

    System.out.println("constantMemoryPointer: " + symbolPointer);
    System.out.println("constantMemorySize: " + byteCount);

    // Fill the symbol with the values 0,1,2... - as many floats
    // as fit into the reported size
    int count = byteCount / Sizeof.FLOAT;
    float hostInput[] = new float[count];
    for (int k = 0; k < hostInput.length; k++)
    {
        hostInput[k] = k;
    }
    cuMemcpyHtoD(symbolPointer, Pointer.to(hostInput), byteCount);

    // Obtain the kernel that will use the constant memory
    CUfunction kernel = new CUfunction();
    cuModuleGetFunction(kernel, module, "constantMemoryKernel");

    // Allocate the device buffer that is passed to the kernel
    CUdeviceptr deviceBuffer = new CUdeviceptr();
    cuMemAlloc(deviceBuffer, byteCount);

    // The kernel receives the buffer and the number of elements
    Pointer kernelArguments = Pointer.to(
        Pointer.to(deviceBuffer),
        Pointer.to(new int[]{ count })
    );

    // Launch a single block with one thread for each element, 
    // without shared memory, in the default stream
    cuLaunchKernel(kernel,
        1, 1, 1, 
        count, 1, 1,
        0, null,
        kernelArguments, null);
    cuCtxSynchronize();

    // Read the buffer back, and check whether it contains the
    // same values that have been copied to the constant memory
    float hostOutput[] = new float[count];
    cuMemcpyDtoH(Pointer.to(hostOutput), deviceBuffer, byteCount);

    String verdict = Arrays.equals(hostInput, hostOutput) 
        ? "PASSED" : "FAILED";
    System.out.println("Test " + verdict);
}

Source File: JCudaDynamicParallelism.java From jcuda-samples with MIT License

4 votes

/**
 * Entry point of the sample: Loads the CUBIN file for the dynamic 
 * parallelism kernel, launches the <code>parentKernel</code>, copies 
 * the device data that was filled by the kernel back to the host, 
 * and prints whether it is equal to the expected reference values.
 * 
 * @param args Not used
 */
public static void main(String[] args)
{
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize a context for the first device
    cuInit(0);
    CUcontext context = new CUcontext();
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    cuCtxCreate(context, 0, device);

    // Create the CUBIN file by calling the NVCC. 
    // See the prepareDefaultCubinFile method for the details about
    // the NVCC parameters that are used here. 
    String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile(
        "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu");

    // Load the CUBIN file 
    CUmodule module = new CUmodule();
    cuModuleLoad(module, cubinFileName);

    // Obtain a function pointer to the "parentKernel" function.
    CUfunction function = new CUfunction();
    cuModuleGetFunction(function, module, "parentKernel");

    // Define the nesting structure. 
    // 
    // NOTE: The number of child threads MUST match the value that 
    // is used in the kernel, for the childKernel<<<1, 8>>> call!
    // 
    int numParentThreads = 8;
    int numChildThreads = 8;

    // Allocate the device data that will be filled by the kernel:
    // One element for each child thread of each parent thread
    int numElements = numParentThreads * numChildThreads;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numElements * Sizeof.FLOAT);

    // Set up the kernel parameters: A pointer to an array
    // of pointers which point to the actual values.
    Pointer kernelParameters = Pointer.to(
        Pointer.to(new int[] { numElements }),
        Pointer.to(deviceData)
    );

    // Call the kernel function. The grid has to contain exactly 
    // numParentThreads parent threads in total, so the number of 
    // blocks is the number of parent threads, divided by the block 
    // size (rounded up). 
    // 
    // The previous computation, 
    //     (numElements + numElements - 1) / blockSizeX
    // was a mistyped rounding division that resulted in 15 blocks 
    // (120 parent threads) for only 64 output elements. 
    // NOTE(review): The kernel source is not visible here. It is 
    // assumed that each parent thread fills numChildThreads elements,
    // as suggested by the reference computation below - confirm.
    int blockSizeX = numParentThreads;
    int gridSizeX = (numParentThreads + blockSizeX - 1) / blockSizeX;
    cuLaunchKernel(function,
        gridSizeX,  1, 1,      // Grid dimension
        blockSizeX, 1, 1,      // Block dimension
        0, null,               // Shared memory size and stream
        kernelParameters, null // Kernel- and extra parameters
    );
    cuCtxSynchronize();

    // Copy the device data to the host. The host array is filled 
    // with the element indices before, which are overwritten 
    // by the copy operation
    float hostData[] = new float[numElements];
    for(int i = 0; i < numElements; i++)
    {
        hostData[i] = i;
    }
    cuMemcpyDtoH(Pointer.to(hostData), 
        deviceData, numElements * Sizeof.FLOAT);

    // Compare the host data with the expected values: The element 
    // for child thread j of parent thread i should be i + 0.1 * j
    float hostDataRef[] = new float[numElements];
    for(int i = 0; i < numParentThreads; i++)
    {
        for (int j=0; j < numChildThreads; j++)
        {
            hostDataRef[i * numChildThreads + j] = i + 0.1f * j;
        }
    }
    System.out.println("Result: "+Arrays.toString(hostData));
    boolean passed = Arrays.equals(hostData, hostDataRef);
    System.out.println(passed ? "PASSED" : "FAILED");

    // Clean up.
    cuMemFree(deviceData);
}

jcuda.driver.CUfunction Java Examples