jcuda.driver.CUdeviceptr Java Examples
The following examples show how to use
jcuda.driver.CUdeviceptr.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JCudaTestUtils.java From jcuda with MIT License | 6 votes |
/**
 * Checks whether two device pointers carry the same native pointer value.
 *
 * NOTE: This is NOT a general-purpose way of comparing arbitrary
 * pointers. Pointer equality is a subtle concept, and by default it is
 * NOT implemented in the pointer classes. This helper exists SOLELY for
 * the test cases that call it.
 *
 * @param p0 The first pointer
 * @param p1 The second pointer
 * @return Whether both pointers report the same native pointer value
 */
static boolean equal(CUdeviceptr p0, CUdeviceptr p1) {
    // Local subclass that copies a CUdeviceptr and re-declares
    // getNativePointer() as public so it can be called from here.
    class AddressProbe extends CUdeviceptr {
        AddressProbe(CUdeviceptr source) {
            super(source);
        }

        @Override
        public long getNativePointer() {
            return super.getNativePointer();
        }
    }

    AddressProbe probe0 = new AddressProbe(p0);
    AddressProbe probe1 = new AddressProbe(p1);
    long address0 = probe0.getNativePointer();
    long address1 = probe1.getNativePointer();
    return address0 == address1;
}
Example #2
Source File: GPUHelper.java From Juicebox with MIT License | 5 votes |
/**
 * Allocates device memory large enough for the given host array and
 * copies the array contents into it.
 *
 * This method does not free the allocation; the returned pointer stays
 * valid until it is passed to cuMemFree.
 *
 * @param input The host data to upload
 * @return The device pointer referring to the uploaded copy
 */
public static CUdeviceptr allocateInput(float[] input) {
    // Compute the byte count once, in 64-bit arithmetic: the int product
    // length * Sizeof.FLOAT wraps around for very large arrays.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    Pointer hostPointer = Pointer.to(input);
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, hostPointer, numBytes);
    return dInput;
}
Example #3
Source File: JCudaDriverBasicGraphExample.java From jcuda-samples with MIT License | 5 votes |
/**
 * Create device data containing the given float value, the given number
 * of times
 *
 * The data is first built in a host array and then copied to a freshly
 * allocated device buffer. This method does not free that buffer.
 *
 * @param numElements The number of elements
 * @param value The value of the elements
 * @return The pointer to the data
 */
private static CUdeviceptr createDeviceData(int numElements, float value) {
    float[] hostData = new float[numElements];
    for (int i = 0; i < numElements; i++) {
        hostData[i] = value;
    }

    // 64-bit byte count: numElements * Sizeof.FLOAT as an int product
    // would wrap around for very large element counts.
    long numBytes = (long) numElements * Sizeof.FLOAT;
    CUdeviceptr deviceData = new CUdeviceptr();
    cuMemAlloc(deviceData, numBytes);
    cuMemcpyHtoD(deviceData, Pointer.to(hostData), numBytes);
    return deviceData;
}
Example #4
Source File: JCudaReduction.java From jcuda-samples with MIT License | 5 votes |
/** * Initialize the context, module, function and other elements used * in this sample */ private static void init() { // Initialize the driver API and create a context for the first device cuInit(0); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); context = new CUcontext(); cuCtxCreate(context, 0, device); // Create the PTX file by calling the NVCC String ptxFileName = JCudaSamplesUtils.preparePtxFile( "src/main/resources/kernels/JCudaReductionKernel.cu"); // Load the module from the PTX file module = new CUmodule(); cuModuleLoad(module, ptxFileName); // Obtain a function pointer to the "reduce" function. function = new CUfunction(); cuModuleGetFunction(function, module, "reduce"); // Allocate a chunk of temporary memory (must be at least // numberOfBlocks * Sizeof.FLOAT) deviceBuffer = new CUdeviceptr(); cuMemAlloc(deviceBuffer, 1024 * Sizeof.FLOAT); }
Example #5
Source File: GPUHelper.java From JuiceboxLegacy with MIT License | 5 votes |
/**
 * Allocates device memory large enough for the given host array and
 * copies the array contents into it.
 *
 * This method does not free the allocation; the returned pointer stays
 * valid until it is passed to cuMemFree.
 *
 * @param input The host data to upload
 * @return The device pointer referring to the uploaded copy
 */
public static CUdeviceptr allocateInput(float[] input) {
    // Compute the byte count once, in 64-bit arithmetic: the int product
    // length * Sizeof.FLOAT wraps around for very large arrays.
    long numBytes = (long) input.length * Sizeof.FLOAT;
    Pointer hostPointer = Pointer.to(input);
    CUdeviceptr dInput = new CUdeviceptr();
    cuMemAlloc(dInput, numBytes);
    cuMemcpyHtoD(dInput, hostPointer, numBytes);
    return dInput;
}
Example #6
Source File: CUDAInnerLoop.java From ocular with GNU General Public License v3.0 | 5 votes |
/**
 * Stores the template data in this instance and allocates the device
 * buffers d_Ow, d_Ob, d_scores, d_Tw and d_Tb. For every template width
 * that has at least one index, the white and black templates of that
 * width are uploaded to the device.
 *
 * All byte counts are computed in 64-bit arithmetic so that large
 * sequence lengths or template counts cannot wrap around an int before
 * being passed to cuMemAlloc / cuMemcpyHtoD.
 *
 * @param whiteTemplates host data per template width, indexed by
 *        (width - minTemplateWidth); uploaded into d_Tw
 * @param blackTemplates host data per template width, indexed by
 *        (width - minTemplateWidth); uploaded into d_Tb
 * @param templateNumIndices per-width count; widths whose count is not
 *        greater than 0 get no device buffer
 * @param templateIndicesOffsets stored in this instance, not otherwise
 *        used by this method
 * @param minTemplateWidth smallest template width
 * @param maxTemplateWidth largest template width (inclusive)
 * @param maxSequenceLength used to size d_Ow, d_Ob and d_scores
 * @param totalTemplateNumIndices used to size d_scores
 */
public void startup(float[][] whiteTemplates, float[][] blackTemplates, int[] templateNumIndices, int[] templateIndicesOffsets, int minTemplateWidth, int maxTemplateWidth, int maxSequenceLength, int totalTemplateNumIndices) {
    this.whiteTemplates = whiteTemplates;
    this.blackTemplates = blackTemplates;
    this.templateNumIndices = templateNumIndices;
    this.templateIndicesOffsets = templateIndicesOffsets;
    this.maxTemplateWidth = maxTemplateWidth;
    this.minTemplateWidth = minTemplateWidth;
    this.totalTemplateNumIndices = totalTemplateNumIndices;

    int numTemplateWidths = (maxTemplateWidth - minTemplateWidth) + 1;

    // maxSequenceLength rounded up to a multiple of BLOCK_SIZE_X*ROLL_X
    int extendedMaxSeqLength = (BLOCK_SIZE_X * ROLL_X)
        * (int) Math.ceil(((double) maxSequenceLength) / (BLOCK_SIZE_X * ROLL_X));

    // d_Ow and d_Ob have the same size
    long observationBytes = (long) (extendedMaxSeqLength + maxTemplateWidth - 1)
        * CharacterTemplate.LINE_HEIGHT * Sizeof.FLOAT;
    this.d_Ow = new CUdeviceptr();
    cuMemAlloc(d_Ow, observationBytes);
    this.d_Ob = new CUdeviceptr();
    cuMemAlloc(d_Ob, observationBytes);

    long scoreBytes = (long) maxSequenceLength * totalTemplateNumIndices * Sizeof.FLOAT;
    this.d_scores = new CUdeviceptr();
    cuMemAlloc(d_scores, scoreBytes);

    this.d_Tw = new CUdeviceptr[numTemplateWidths];
    this.d_Tb = new CUdeviceptr[numTemplateWidths];
    // i is (template width - minTemplateWidth)
    for (int i = 0; i < numTemplateWidths; ++i) {
        if (templateNumIndices[i] > 0) {
            long whiteBytes = (long) whiteTemplates[i].length * Sizeof.FLOAT;
            d_Tw[i] = new CUdeviceptr();
            cuMemAlloc(d_Tw[i], whiteBytes);
            cuMemcpyHtoD(d_Tw[i], Pointer.to(whiteTemplates[i]), whiteBytes);

            long blackBytes = (long) blackTemplates[i].length * Sizeof.FLOAT;
            d_Tb[i] = new CUdeviceptr();
            cuMemAlloc(d_Tb[i], blackBytes);
            cuMemcpyHtoD(d_Tb[i], Pointer.to(blackTemplates[i]), blackBytes);
        }
    }
}
Example #7
Source File: Context.java From OSPREY3 with GNU General Public License v2.0 | 4 votes |
/**
 * Releases the given device buffer by passing it to
 * {@code JCudaDriver.cuMemFree}.
 *
 * @param pdBuf the device pointer to free
 */
public void free(CUdeviceptr pdBuf) {
    JCudaDriver.cuMemFree(pdBuf);
}
Example #8
Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License | 4 votes |
@Test public void testMemRangeAttribute() { JCudaDriver.setExceptionsEnabled(true); cuInit(0); CUcontext contest = new CUcontext(); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); cuCtxCreate(contest, 0, device); int size = 64; CUdeviceptr deviceData = new CUdeviceptr(); cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST); int readMostly[] = { 12345 }; int lastPrefetchLocation[] = { 12345 }; int preferredLocation[] = { 12345 }; int accessedBy[] = { 12345, 12345, 12345 }; cuMemRangeGetAttribute(Pointer.to(readMostly), Sizeof.INT, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, deviceData, size); cuMemRangeGetAttribute(Pointer.to(lastPrefetchLocation), Sizeof.INT, CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, deviceData, size); cuMemRangeGetAttribute(Pointer.to(preferredLocation), Sizeof.INT, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, deviceData, size); cuMemRangeGetAttribute( Pointer.to(accessedBy), Sizeof.INT * accessedBy.length, CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, deviceData, size); boolean printResults = false; //printResults = true; if (printResults) { System.out.println("readMostly : " + Arrays.toString(lastPrefetchLocation)); System.out.println("lastPrefetchLocation: " + Arrays.toString(lastPrefetchLocation)); System.out.println("preferredLocation : " + Arrays.toString(preferredLocation)); System.out.println("accessedBy : " + Arrays.toString(accessedBy)); } }
Example #9
Source File: JCudaDriverMemRangeTest.java From jcuda with MIT License | 4 votes |
@Test public void testMemRangeAttributes() { JCudaDriver.setExceptionsEnabled(true); cuInit(0); CUcontext contest = new CUcontext(); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); cuCtxCreate(contest, 0, device); int size = 64; CUdeviceptr deviceData = new CUdeviceptr(); cuMemAllocManaged(deviceData, size, CU_MEM_ATTACH_HOST); int readMostly[] = { 12345 }; int lastPrefetchLocation[] = { 12345 }; int preferredLocation[] = { 12345 }; int accessedBy[] = { 12345, 12345, 12345 }; Pointer data[] = { Pointer.to(readMostly), Pointer.to(lastPrefetchLocation), Pointer.to(preferredLocation), Pointer.to(accessedBy) }; long dataSizes[] = { Sizeof.INT, Sizeof.INT, Sizeof.INT, Sizeof.INT * accessedBy.length }; int attributes[] = { CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, }; cuMemRangeGetAttributes(data, dataSizes, attributes, attributes.length, deviceData, size); boolean printResults = false; //printResults = true; if (printResults) { System.out.println("readMostly : " + Arrays.toString(lastPrefetchLocation)); System.out.println("lastPrefetchLocation: " + Arrays.toString(lastPrefetchLocation)); System.out.println("preferredLocation : " + Arrays.toString(preferredLocation)); System.out.println("accessedBy : " + Arrays.toString(accessedBy)); } }
Example #10
Source File: GPUTesting.java From JuiceboxLegacy with MIT License | 4 votes |
public static void test() { JCudaDriver.setExceptionsEnabled(true); String sourceCode = "extern \"C\"" + "\n" + "__global__ void add(float *result, float *a, float *b)" + "\n" + "{" + "\n" + " int i = threadIdx.x;" + "\n" + " result[i] = a[i] + b[i];" + "\n" + "}"; // Prepare the kernel System.out.println("Preparing the KernelLauncher..."); KernelLauncher kernelLauncher = KernelLauncher.compile(sourceCode, "add"); // Create the input data System.out.println("Creating input data..."); int size = 10; float result[] = new float[size]; float a[] = new float[size]; float b[] = new float[size]; for (int i = 0; i < size; i++) { a[i] = i; b[i] = i; } // Allocate the device memory and copy the input // data to the device System.out.println("Initializing device memory..."); CUdeviceptr dResult = GPUHelper.allocateOutput(size, Sizeof.FLOAT); CUdeviceptr dA = GPUHelper.allocateInput(a); CUdeviceptr dB = GPUHelper.allocateInput(b); System.out.println("Calling the kernel..."); kernelLauncher.setBlockSize(size, 1, 1); kernelLauncher.call(dResult, dA, dB); // Copy the result from the device to the host System.out.println("Obtaining results..."); cuMemcpyDtoH(Pointer.to(result), dResult, size * Sizeof.FLOAT); System.out.println("Result: " + Arrays.toString(result)); // Clean up cuMemFree(dA); cuMemFree(dB); cuMemFree(dResult); }
Example #11
Source File: GPUHelper.java From JuiceboxLegacy with MIT License | 4 votes |
/**
 * Allocates an uninitialized device buffer for the given number of
 * elements. This method does not free the allocation.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer referring to the new buffer
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    // 64-bit byte count: the int product size * typeSize would wrap
    // around for large buffers.
    long numBytes = (long) size * typeSize;
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, numBytes);
    return dOutput;
}
Example #12
Source File: GPUHelper.java From JuiceboxLegacy with MIT License | 4 votes |
/**
 * Frees every device pointer in the given array, in array order, by
 * passing each one to cuMemFree.
 *
 * @param pointers The device pointers to free
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    for (int i = 0; i < pointers.length; i++) {
        cuMemFree(pointers[i]);
    }
}
Example #13
Source File: Context.java From OSPREY3 with GNU General Public License v2.0 | 4 votes |
/**
 * Allocates a device buffer of the requested size through
 * {@code JCudaDriver.cuMemAlloc}.
 *
 * @param numBytes the size of the buffer, in bytes
 * @return the device pointer that was passed to the allocation call
 */
public CUdeviceptr malloc(long numBytes) {
    final CUdeviceptr devicePointer = new CUdeviceptr();
    JCudaDriver.cuMemAlloc(devicePointer, numBytes);
    return devicePointer;
}
Example #14
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 3D float4 texture access */ private boolean test_float4_3D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = sizeY; ad.Depth = sizeZ; ad.NumChannels = 4; cuArray3DCreate(array, ad); // Copy the host input to the array CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D(); copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST; copy.srcHost = Pointer.to(input_float4_3D); copy.srcPitch = sizeX * Sizeof.FLOAT * 4; copy.srcHeight = sizeY; copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY; copy.dstArray = array; copy.dstHeight = sizeX; copy.WidthInBytes = sizeX * Sizeof.FLOAT * 4; copy.Height = sizeY; copy.Depth = sizeZ; cuMemcpy3D(copy); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float4_3D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 4); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float4_3D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }), Pointer.to(new float[]{ posY }), Pointer.to(new float[]{ posZ }) ); // Call the kernel function. 
cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[4]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4); // Print the results log("Result float4 3D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float4 3D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #15
Source File: Context.java From OSPREY3 with GNU General Public License v2.0 | 4 votes |
/**
 * Issues a host-to-device copy on the given stream through
 * {@code JCudaDriver.cuMemcpyHtoDAsync}.
 *
 * @param pdBuf the destination device buffer
 * @param phBuf the source host buffer
 * @param numBytes the number of bytes to copy
 * @param stream the stream whose underlying stream handle is used
 */
public void uploadAsync(CUdeviceptr pdBuf, Pointer phBuf, long numBytes, GpuStream stream) {
    JCudaDriver.cuMemcpyHtoDAsync(
        pdBuf,
        phBuf,
        numBytes,
        stream.getStream());
}
Example #16
Source File: Context.java From OSPREY3 with GNU General Public License v2.0 | 4 votes |
/**
 * Issues a device-to-host copy on the given stream through
 * {@code JCudaDriver.cuMemcpyDtoHAsync}.
 *
 * @param phBuf the destination host buffer
 * @param pdBuf the source device buffer
 * @param numBytes the number of bytes to copy
 * @param stream the stream whose underlying stream handle is used
 */
public void downloadAsync(Pointer phBuf, CUdeviceptr pdBuf, long numBytes, GpuStream stream) {
    JCudaDriver.cuMemcpyDtoHAsync(
        phBuf,
        pdBuf,
        numBytes,
        stream.getStream());
}
Example #17
Source File: GPUTesting.java From Juicebox with MIT License | 4 votes |
public static void test() { JCudaDriver.setExceptionsEnabled(true); String sourceCode = "extern \"C\"" + "\n" + "__global__ void add(float *result, float *a, float *b)" + "\n" + "{" + "\n" + " int i = threadIdx.x;" + "\n" + " result[i] = a[i] + b[i];" + "\n" + "}"; // Prepare the kernel System.out.println("Preparing the KernelLauncher..."); KernelLauncher kernelLauncher = KernelLauncher.compile(sourceCode, "add"); // Create the input data System.out.println("Creating input data..."); int size = 10; float[] result = new float[size]; float[] a = new float[size]; float[] b = new float[size]; for (int i = 0; i < size; i++) { a[i] = i; b[i] = i; } // Allocate the device memory and copy the input // data to the device System.out.println("Initializing device memory..."); CUdeviceptr dResult = GPUHelper.allocateOutput(size, Sizeof.FLOAT); CUdeviceptr dA = GPUHelper.allocateInput(a); CUdeviceptr dB = GPUHelper.allocateInput(b); System.out.println("Calling the kernel..."); kernelLauncher.setBlockSize(size, 1, 1); kernelLauncher.call(dResult, dA, dB); // Copy the result from the device to the host System.out.println("Obtaining results..."); cuMemcpyDtoH(Pointer.to(result), dResult, size * Sizeof.FLOAT); System.out.println("Result: " + Arrays.toString(result)); // Clean up cuMemFree(dA); cuMemFree(dB); cuMemFree(dResult); }
Example #18
Source File: GPUHelper.java From Juicebox with MIT License | 4 votes |
/**
 * Allocates an uninitialized device buffer for the given number of
 * elements. This method does not free the allocation.
 *
 * @param size The number of elements
 * @param typeSize The size of one element, in bytes
 * @return The device pointer referring to the new buffer
 */
public static CUdeviceptr allocateOutput(int size, int typeSize) {
    // 64-bit byte count: the int product size * typeSize would wrap
    // around for large buffers.
    long numBytes = (long) size * typeSize;
    CUdeviceptr dOutput = new CUdeviceptr();
    cuMemAlloc(dOutput, numBytes);
    return dOutput;
}
Example #19
Source File: GPUHelper.java From Juicebox with MIT License | 4 votes |
/**
 * Frees every device pointer in the given array, in array order, by
 * passing each one to cuMemFree.
 *
 * @param pointers The device pointers to free
 */
public static void freeUpMemory(CUdeviceptr[] pointers) {
    for (int i = 0; i < pointers.length; i++) {
        cuMemFree(pointers[i]);
    }
}
Example #20
Source File: VecFloatSample.java From jcuda-samples with MIT License | 4 votes |
/**
 * Entry point of the sample: computes cos(x)^2 + sin(y)^2 for 50000
 * elements on the device with the VecFloat library and verifies the
 * output against the same computation on the host.
 *
 * @param args Not used
 */
public static void main(String[] args) {
    // Enable exceptions and omit all subsequent error checks
    JCudaDriver.setExceptionsEnabled(true);

    // Initialize the driver and create a context for the first device.
    cuInit(0);
    CUdevice device = new CUdevice();
    cuDeviceGet(device, 0);
    CUcontext context = new CUcontext();
    cuCtxCreate(context, 0, device);

    // Afterwards, initialize the vector library, which will
    // attach to the current context
    VecFloat.init();

    // Allocate and fill the host input data
    int n = 50000;
    int numBytes = n * Sizeof.FLOAT;
    float[] hostX = new float[n];
    float[] hostY = new float[n];
    for (int k = 0; k < n; k++) {
        hostX[k] = (float) k;
        hostY[k] = (float) k;
    }

    // Allocate the device pointers, and copy the
    // host input data to the device
    CUdeviceptr deviceX = new CUdeviceptr();
    cuMemAlloc(deviceX, numBytes);
    cuMemcpyHtoD(deviceX, Pointer.to(hostX), numBytes);
    CUdeviceptr deviceY = new CUdeviceptr();
    cuMemAlloc(deviceY, numBytes);
    cuMemcpyHtoD(deviceY, Pointer.to(hostY), numBytes);
    CUdeviceptr deviceResult = new CUdeviceptr();
    cuMemAlloc(deviceResult, numBytes);

    // Perform the vector operations
    VecFloat.cos(n, deviceX, deviceX);               // x = cos(x)
    VecFloat.mul(n, deviceX, deviceX, deviceX);      // x = x*x
    VecFloat.sin(n, deviceY, deviceY);               // y = sin(y)
    VecFloat.mul(n, deviceY, deviceY, deviceY);      // y = y*y
    VecFloat.add(n, deviceResult, deviceX, deviceY); // result = x+y

    // Allocate host output memory and copy the device output
    // to the host.
    float[] hostResult = new float[n];
    cuMemcpyDtoH(Pointer.to(hostResult), deviceResult, numBytes);

    // Verify the result; stop at the first element that is off
    boolean passed = true;
    int index = 0;
    while (passed && index < n) {
        float expected = (float) (
            Math.cos(hostX[index]) * Math.cos(hostX[index]) +
            Math.sin(hostY[index]) * Math.sin(hostY[index]));
        if (Math.abs(hostResult[index] - expected) > 1e-5) {
            System.out.println(
                "At index " + index + " found " + hostResult[index] +
                " but expected " + expected);
            passed = false;
        }
        index++;
    }
    System.out.println("Test " + (passed ? "PASSED" : "FAILED"));

    // Clean up.
    cuMemFree(deviceX);
    cuMemFree(deviceY);
    cuMemFree(deviceResult);
    VecFloat.shutdown();
}
Example #21
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 2D float4 texture access */ private boolean test_float4_2D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = sizeY; ad.NumChannels = 4; cuArrayCreate(array, ad); // Copy the host input to the array CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D(); copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST; copyHD.srcHost = Pointer.to(input_float4_2D); copyHD.srcPitch = sizeX * Sizeof.FLOAT * 4; copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY; copyHD.dstArray = array; copyHD.WidthInBytes = sizeX * Sizeof.FLOAT * 4; copyHD.Height = sizeY; cuMemcpy2D(copyHD); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float4_2D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 4); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float4_2D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }), Pointer.to(new float[]{ posY }) ); // Call the kernel function. 
cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[4]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4); // Print the results log("Result float4 2D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 1.5f, 1.5f, 1.5f, 1.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float4 2D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #22
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 1D float4 texture access */ private boolean test_float4_1D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = 1; ad.NumChannels = 4; cuArrayCreate(array, ad); // Copy the host input to the array Pointer pInput = Pointer.to(input_float4_1D); cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT * 4); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float4_1D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 4); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float4_1D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }) ); // Call the kernel function. cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[4]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4); // Print the results log("Result float4 1D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 0.5f, 0.5f, 0.5f, 0.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float4 1D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #23
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 3D float texture access */ private boolean test_float_3D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = sizeY; ad.Depth = sizeZ; ad.NumChannels = 1; cuArray3DCreate(array, ad); // Copy the host input to the array CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D(); copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST; copy.srcHost = Pointer.to(input_float_3D); copy.srcPitch = sizeX * Sizeof.FLOAT; copy.srcHeight = sizeY; copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY; copy.dstArray = array; copy.dstHeight = sizeX; copy.WidthInBytes = sizeX * Sizeof.FLOAT; copy.Height = sizeY; copy.Depth = sizeZ; cuMemcpy3D(copy); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float_3D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 1); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float_3D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }), Pointer.to(new float[]{ posY }), Pointer.to(new float[]{ posZ }) ); // Call the kernel function. 
cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[1]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1); // Print the results log("Result float 3D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 3.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float 3D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #24
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 2D float texture access */ private boolean test_float_2D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = sizeY; ad.NumChannels = 1; cuArrayCreate(array, ad); // Copy the host input to the array CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D(); copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST; copyHD.srcHost = Pointer.to(input_float_2D); copyHD.srcPitch = sizeX * Sizeof.FLOAT; copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY; copyHD.dstArray = array; copyHD.WidthInBytes = sizeX * Sizeof.FLOAT; copyHD.Height = sizeY; cuMemcpy2D(copyHD); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float_2D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 1); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float_2D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }), Pointer.to(new float[]{ posY }) ); // Call the kernel function. 
cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[1]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1); // Print the results log("Result float 2D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 1.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float 2D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #25
Source File: JCudaDriverTextureTest.java From jcuda with MIT License | 4 votes |
/** * Test the 1D float texture access */ private boolean test_float_1D() { // Create the array on the device CUarray array = new CUarray(); CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR(); ad.Format = CU_AD_FORMAT_FLOAT; ad.Width = sizeX; ad.Height = 1; ad.NumChannels = 1; cuArrayCreate(array, ad); // Copy the host input to the array Pointer pInput = Pointer.to(input_float_1D); cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT); // Set up the texture reference CUtexref texref = new CUtexref(); cuModuleGetTexRef(texref, module, "texture_float_1D"); cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR); cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP); cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES); cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1); cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT); // Prepare the output device memory CUdeviceptr dOutput = new CUdeviceptr(); cuMemAlloc(dOutput, Sizeof.FLOAT * 1); // Obtain the test function CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "test_float_1D"); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(dOutput), Pointer.to(new float[]{ posX }) ); // Call the kernel function. cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, null, kernelParameters, null); cuCtxSynchronize(); // Obtain the output on the host float hOutput[] = new float[1]; cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1); // Print the results log("Result float 1D " + Arrays.toString(hOutput)); float expected[] = new float[]{ 0.5f }; boolean passed = Arrays.equals(hOutput, expected); log("Test float 1D " + (passed ? "PASSED" : "FAILED")); // Clean up cuArrayDestroy(array); cuMemFree(dOutput); return passed; }
Example #26
Source File: JCudaConstantMemoryExample.java From jcuda-samples with MIT License | 4 votes |
public static void main(String[] args) throws IOException { // Enable exceptions and omit all subsequent error checks JCudaDriver.setExceptionsEnabled(true); // Initialize the driver and create a context for the first device. cuInit(0); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device); // Create the PTX file by calling the NVCC String ptxFileName = JCudaSamplesUtils.preparePtxFile( "src/main/resources/kernels/JCudaConstantMemoryKernel.cu"); // Load the PTX file. CUmodule module = new CUmodule(); cuModuleLoad(module, ptxFileName); // Obtain the pointer to the constant memory, and print some info CUdeviceptr constantMemoryPointer = new CUdeviceptr(); long constantMemorySizeArray[] = { 0 }; cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray, module, "constantMemoryData"); int constantMemorySize = (int)constantMemorySizeArray[0]; System.out.println("constantMemoryPointer: " + constantMemoryPointer); System.out.println("constantMemorySize: " + constantMemorySize); // Copy some host data to the constant memory int numElements = constantMemorySize / Sizeof.FLOAT; float hostData[] = new float[numElements]; for (int i = 0; i < numElements; i++) { hostData[i] = i; } cuMemcpyHtoD(constantMemoryPointer, Pointer.to(hostData), constantMemorySize); // Now use the constant memory in the kernel call: // Obtain a function pointer to the "constantMemoryKernel" function. 
CUfunction kernel = new CUfunction(); cuModuleGetFunction(kernel, module, "constantMemoryKernel"); // Allocate some device memory CUdeviceptr deviceData = new CUdeviceptr(); cuMemAlloc(deviceData, constantMemorySize); // Set up the kernel parameters Pointer kernelParameters = Pointer.to( Pointer.to(deviceData), Pointer.to(new int[]{numElements}) ); // Launch the kernel int blockSizeX = numElements; int gridSizeX = 1; cuLaunchKernel(kernel, gridSizeX, 1, 1, blockSizeX, 1, 1, 0, null, kernelParameters, null ); cuCtxSynchronize(); // Copy the result back to the host, and verify that it is // the same that was copied to the constant memory float hostResult[] = new float[numElements]; cuMemcpyDtoH(Pointer.to(hostResult), deviceData, constantMemorySize); boolean passed = Arrays.equals(hostData, hostResult); System.out.println("Test " + (passed ? "PASSED" : "FAILED")); }
Example #27
Source File: JCudaDriverUnifiedMemory.java From jcuda-samples with MIT License | 4 votes |
public static void main(String[] args) { JCudaDriver.setExceptionsEnabled(true); JCublas.setExceptionsEnabled(true); // Initialize the driver and create a context for the first device. cuInit(0); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); CUcontext context = new CUcontext(); cuCtxCreate(context, 0, device); // Check if the device supports managed memory int supported[] = { 0 }; cuDeviceGetAttribute(supported, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, device); if (supported[0] == 0) { System.err.println("Device does not support managed memory"); return; } // Allocate managed memory that is accessible to the host int n = 10; long size = n * Sizeof.FLOAT; CUdeviceptr p = new CUdeviceptr(); cuMemAllocManaged(p, size, CU_MEM_ATTACH_HOST); // Obtain the byte buffer from the pointer. This is supported only // for memory that was allocated to be accessible on the host: ByteBuffer bb = p.getByteBuffer(0, size); System.out.println("Buffer on host side: " + bb); // Fill the buffer with sample data FloatBuffer fb = bb.order(ByteOrder.nativeOrder()).asFloatBuffer(); for (int i = 0; i < n; i++) { fb.put(i, i); } // Make the buffer accessible to all devices cuStreamAttachMemAsync(null, p, 0, CU_MEM_ATTACH_GLOBAL); cuStreamSynchronize(null); // Use the pointer in a device operation (here, a dot product with // JCublas, for example). The data that was filled in by the host // will now be used by the device. cublasHandle handle = new cublasHandle(); cublasCreate(handle); float result[] = { -1.0f }; cublasSdot(handle, n, p, 1, p, 1, Pointer.to(result)); System.out.println("Result: " + result[0]); }
Example #28
Source File: JCudaDynamicParallelism.java From jcuda-samples with MIT License | 4 votes |
public static void main(String[] args) { JCudaDriver.setExceptionsEnabled(true); // Initialize a context for the first device cuInit(0); CUcontext context = new CUcontext(); CUdevice device = new CUdevice(); cuDeviceGet(device, 0); cuCtxCreate(context, 0, device); // Create the CUBIN file by calling the NVCC. // See the prepareDefaultCubinFile method for the details about // the NVCC parameters that are used here. String cubinFileName = JCudaSamplesUtils.prepareDefaultCubinFile( "src/main/resources/kernels/JCudaDynamicParallelismKernel.cu"); // Load the CUBIN file CUmodule module = new CUmodule(); cuModuleLoad(module, cubinFileName); // Obtain a function pointer to the "parentKernel" function. CUfunction function = new CUfunction(); cuModuleGetFunction(function, module, "parentKernel"); // Define the nesting structure. // // NOTE: The number of child threads MUST match the value that // is used in the kernel, for the childKernel<<<1, 8>>> call! // int numParentThreads = 8; int numChildThreads = 8; // Allocate the device data that will be filled by the kernel int numElements = numParentThreads * numChildThreads; CUdeviceptr deviceData = new CUdeviceptr(); cuMemAlloc(deviceData, numElements * Sizeof.FLOAT); // Set up the kernel parameters: A pointer to an array // of pointers which point to the actual values. Pointer kernelParameters = Pointer.to( Pointer.to(new int[] { numElements }), Pointer.to(deviceData) ); // Call the kernel function. 
int blockSizeX = numParentThreads; int gridSizeX = (numElements + numElements - 1) / blockSizeX; cuLaunchKernel(function, gridSizeX, 1, 1, // Grid dimension blockSizeX, 1, 1, // Block dimension 0, null, // Shared memory size and stream kernelParameters, null // Kernel- and extra parameters ); cuCtxSynchronize(); // Copy the device data to the host float hostData[] = new float[numElements]; for(int i = 0; i < numElements; i++) { hostData[i] = i; } cuMemcpyDtoH(Pointer.to(hostData), deviceData, numElements * Sizeof.FLOAT); // Compare the host data with the expected values float hostDataRef[] = new float[numElements]; for(int i = 0; i < numParentThreads; i++) { for (int j=0; j < numChildThreads; j++) { hostDataRef[i * numChildThreads + j] = i + 0.1f * j; } } System.out.println("Result: "+Arrays.toString(hostData)); boolean passed = Arrays.equals(hostData, hostDataRef); System.out.println(passed ? "PASSED" : "FAILED"); // Clean up. cuMemFree(deviceData); }
Example #29
Source File: JCudaDriverStreamCallbacks.java From jcuda-samples with MIT License | 4 votes |
/** * Create a Workload instance. This method is called by multiple host * threads, to create the individual workloads, and to send the * commands for processing the workloads to CUDA * * @param index The index of the workload * @param executor The executor service */ private static void createWorkloadOnHost( final int index, final ExecutorService executor) { // Make sure that the CUDA context is current for the calling thread cuCtxSetCurrent(context); // Initialize the workload, and create the CUDA stream System.out.println(index + ": Initializing workload"); final Workload workload = new Workload(); workload.index = index; workload.stream = new CUstream(); cuStreamCreate(workload.stream, 0); // Create the host data of the workload System.out.println(index + ": Create host data"); workload.hostData = new Pointer(); cuMemHostAlloc(workload.hostData, WORKLOAD_SIZE * Sizeof.INT, 0); ByteBuffer hostByteBuffer = workload.hostData.getByteBuffer(0, WORKLOAD_SIZE * Sizeof.INT); IntBuffer hostIntBuffer = hostByteBuffer.order(ByteOrder.nativeOrder()).asIntBuffer(); for (int i = 0; i < WORKLOAD_SIZE; i++) { hostIntBuffer.put(i, i); } workload.deviceData = new CUdeviceptr(); cuMemAlloc(workload.deviceData, WORKLOAD_SIZE * Sizeof.INT); // Execute the CUDA commands: // - Copy the host data to the device // - Execute the kernel // - Copy the modified device data back to the host // All this is done asynchronously System.out.println(index + ": Execute CUDA commands"); cuMemcpyHtoDAsync(workload.deviceData, workload.hostData, WORKLOAD_SIZE * Sizeof.INT, workload.stream); Pointer kernelParameters = Pointer.to( Pointer.to(new int[]{WORKLOAD_SIZE}), Pointer.to(workload.deviceData) ); int blockSizeX = 256; int gridSizeX = (WORKLOAD_SIZE + blockSizeX - 1) / blockSizeX; cuLaunchKernel(function, gridSizeX, 1, 1, blockSizeX, 1, 1, 0, workload.stream, kernelParameters, null); cuMemcpyDtoHAsync(workload.hostData, workload.deviceData, WORKLOAD_SIZE * Sizeof.INT, workload.stream); // Define 
the callback that will be called when all CUDA commands // on the stream have finished. This callback will forward the // workload to the "finishWorkloadOnHost" method. CUstreamCallback callback = new CUstreamCallback() { @Override public void call( CUstream hStream, int status, final Object userData) { System.out.println(index + ": Callback was called"); Runnable runnable = new Runnable() { @Override public void run() { finishWorkloadOnHost(userData); } }; executor.submit(runnable); } }; cuStreamAddCallback(workload.stream, callback, workload, 0); }
Example #30
Source File: JCudaReduction.java From jcuda-samples with MIT License | 4 votes |
/** * Entry point of this sample * * @param args Not used */ public static void main(String args[]) { // Enable exceptions and omit all subsequent error checks JCudaDriver.setExceptionsEnabled(true); init(); boolean passed = true; for (int n = 100000; n <= 26500000; n *= 2) { float hostInput[] = createRandomArray(n); long timeNs0 = 0; long timeNs1 = 0; // Copy the input data to the device timeNs0 = System.nanoTime(); CUdeviceptr deviceInput = new CUdeviceptr(); cuMemAlloc(deviceInput, hostInput.length * Sizeof.FLOAT); cuMemcpyHtoD(deviceInput, Pointer.to(hostInput), hostInput.length * Sizeof.FLOAT); timeNs1 = System.nanoTime(); long durationCopyNs = timeNs1 - timeNs0; // Execute the reduction with CUDA timeNs0 = System.nanoTime(); float resultJCuda = reduce(deviceInput, hostInput.length); timeNs1 = System.nanoTime(); long durationCompNs = timeNs1 - timeNs0; cuMemFree(deviceInput); // Execute the reduction with Java timeNs0 = System.nanoTime(); float resultJava = reduceHost(hostInput); timeNs1 = System.nanoTime(); long durationJavaNs = timeNs1 - timeNs0; System.out.println("Reduction of " + n + " elements"); System.out.printf(Locale.ENGLISH, " JCuda: %7.3f ms, result: %f " + "(copy: %7.3f ms, comp: %7.3f ms)\n", (durationCopyNs + durationCompNs) / 1e6, resultJCuda, durationCopyNs / 1e6, durationCompNs / 1e6); System.out.printf(Locale.ENGLISH, " Java : %7.3f ms, result: %f\n", durationJavaNs / 1e6, resultJava); passed &= Math.abs(resultJCuda - resultJava) < resultJava * 1e-5; } System.out.println("Test " + (passed ? "PASSED" : "FAILED")); shutdown(); }